From aa4402ce9904cc3b6f51f2de920452ab4ffcac9f Mon Sep 17 00:00:00 2001
From: jfrery
Date: Thu, 14 Nov 2024 20:34:03 +0100
Subject: [PATCH 1/6] chore: fix hybrid model glwe lora mlp

---
 .gitleaksignore                       |  1 +
 src/concrete/ml/torch/hybrid_model.py | 41 +++++++++-------------------
 2 files changed, 13 insertions(+), 29 deletions(-)

diff --git a/.gitleaksignore b/.gitleaksignore
index 1356df088..a1c87598d 100644
--- a/.gitleaksignore
+++ b/.gitleaksignore
@@ -10,3 +10,4 @@ f41de03048a9ed27946b875e81b34138bb4bb17b:use_case_examples/training/analyze.ipyn
 e2904473898ddd325f245f4faca526a0e9520f49:builders/Dockerfile.zamalang-env:generic-api-key:5
 7d5e885816f1f1e432dd94da38c5c8267292056a:docs/advanced_examples/XGBRegressor.ipynb:aws-access-token:1026
 25c5e7abaa7382520af3fb7a64266e193b1f6a59:poetry.lock:square-access-token:6401
+eebd4bea78f6dd2361baa7f94f68ae4cba8b9fe8:tests/deployment/test_deployment.py:generic-api-key:20
\ No newline at end of file
diff --git a/src/concrete/ml/torch/hybrid_model.py b/src/concrete/ml/torch/hybrid_model.py
index ae2b885e8..acf244330 100644
--- a/src/concrete/ml/torch/hybrid_model.py
+++ b/src/concrete/ml/torch/hybrid_model.py
@@ -2,7 +2,6 @@
 # pylint: disable=too-many-lines

 import ast
-import contextvars
 import io
 import sys
 import time
@@ -102,13 +101,6 @@ def convert_conv1d_to_linear(layer_or_module):
     return layer_or_module


-# This module member is instantiated by the Hybrid FHE model
-# when hybrid FHE forward is called and the GLWE backend is available
-_optimized_linear_executor: contextvars.ContextVar[Optional[GLWELinearLayerExecutor]] = (
-    contextvars.ContextVar("optimized_linear_executor")
-)
-
-
 # pylint: disable-next=too-many-instance-attributes
 class RemoteModule(nn.Module):
     """A wrapper class for the modules to be evaluated remotely with FHE."""
@@ -136,6 +128,7 @@ def __init__(
         self.model_name: Optional[str] = model_name
         self.verbose = verbose
         self.optimized_linear_execution = optimized_linear_execution
+        self.executor: Optional[GLWELinearLayerExecutor] = None

     def init_fhe_client(
         self, path_to_client: Optional[Path] = None, path_to_keys: Optional[Path] = None
@@ -252,15 +245,10 @@ def forward(self, x: torch.Tensor) -> Union[torch.Tensor, QuantTensor]:
         }:
             assert self.private_q_module is not None

-            try:
-                optimized_linear_layer_executor = _optimized_linear_executor.get()
-            except LookupError:
-                optimized_linear_layer_executor = None
-
-            if optimized_linear_layer_executor:
+            if self.executor:
                 # Delegate to the optimized GLWE executor
                 y = torch.Tensor(
-                    optimized_linear_layer_executor.forward(
+                    self.executor.forward(
                         x.detach().numpy(), self.private_q_module, self.fhe_local_mode
                     )
                 )
@@ -269,6 +257,7 @@ def forward(self, x: torch.Tensor) -> Union[torch.Tensor, QuantTensor]:
                 y = torch.Tensor(
                     self.private_q_module.forward(x.detach().numpy(), fhe=self.fhe_local_mode.value)
                 )
+
         elif self.fhe_local_mode == HybridFHEMode.CALIBRATE:
             # Calling torch + gathering calibration data
             assert self.private_module is not None
@@ -278,14 +267,7 @@ def forward(self, x: torch.Tensor) -> Union[torch.Tensor, QuantTensor]:

         elif self.fhe_local_mode == HybridFHEMode.REMOTE:  # pragma:no cover
             # Remote call
-            try:
-                optimized_linear_layer_executor = _optimized_linear_executor.get()
-            except LookupError:
-                optimized_linear_layer_executor = None
-
-            assert optimized_linear_layer_executor is None, (
-                "Remote optimized linear layers " "are not yet implemented"
-            )
+            assert self.executor is None, "Remote optimized linear layers are not yet implemented"
             y = self.remote_call(x)

         elif self.fhe_local_mode == HybridFHEMode.TORCH:
@@ -400,6 +382,7 @@ def __init__(
         self.configuration: Optional[Configuration] = None
         self.model_name = model_name
         self.verbose = verbose
+        self.executor: Optional[GLWELinearLayerExecutor] = None

         self._replace_modules()

@@ -461,6 +444,7 @@ def forward(self, x: torch.Tensor, fhe: str = "disable") -> torch.Tensor:

         # Validate the FHE mode
         fhe_mode = HybridFHEMode(fhe)
+        self.executor = None

         if _HAS_GLWE_BACKEND and self._all_layers_are_pure_linear:
             if fhe_mode == HybridFHEMode.SIMULATE:
@@ -476,17 +460,16 @@ def forward(self, x: torch.Tensor, fhe: str = "disable") -> torch.Tensor:

                 # Loading keys from a file could be done here, and the
                 # keys could be passed as arguments to the Executor
-                executor = GLWELinearLayerExecutor()
-
+                self.executor = GLWELinearLayerExecutor()
                 if fhe_mode != HybridFHEMode.DISABLE:
-                    executor.keygen()
+                    self.executor.keygen()

-                _optimized_linear_executor.set(executor)
+                # Update executor for all remote modules
+                for module in self.remote_modules.values():
+                    module.executor = self.executor

         result = self.model(x)

-        _optimized_linear_executor.set(None)
-
         return result

     def __call__(self, x: torch.Tensor, fhe: str = "disable") -> torch.Tensor:

From f0cd6738192f7793560fbca3a70f6dd3f2653282 Mon Sep 17 00:00:00 2001
From: jfrery
Date: Fri, 15 Nov 2024 11:29:08 +0100
Subject: [PATCH 2/6] chore: only run glwe backend when input / output dimensions are above 512

---
 src/concrete/ml/torch/hybrid_model.py | 22 +++++++--
 tests/torch/test_hybrid_converter.py  | 67 ++++++++++++++++-----------
 2 files changed, 56 insertions(+), 33 deletions(-)

diff --git a/src/concrete/ml/torch/hybrid_model.py b/src/concrete/ml/torch/hybrid_model.py
index acf244330..242065e48 100644
--- a/src/concrete/ml/torch/hybrid_model.py
+++ b/src/concrete/ml/torch/hybrid_model.py
@@ -389,7 +389,7 @@ def __init__(

     def _replace_modules(self):
         """Replace the private modules in the model with remote layers."""
-        self._all_layers_are_pure_linear = True
+        self._has_large_linear_layers = True
         for module_name in self.module_names:
             # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3858
             # Conv1d introduce reshaping operations which adds more TLU
@@ -404,8 +404,20 @@ def _replace_modules(self):
                 self.private_modules[module_name],
                 (nn.Linear, ForwardModuleLinear, BackwardModuleLinear),
             )
+
+            # Check input and output dimensions for linear layers
+            # If the input or output dimension is below 512 we do not use the GLWE optimization.
+            # Optimal input dimension is 2048, below 512 the performance is too low.
+            if is_pure_linear_layer:
+                module = self.private_modules[module_name]
+                input_dim = module.in_features if hasattr(module, "in_features") else 0
+                output_dim = module.out_features if hasattr(module, "out_features") else 0
+                is_pure_linear_layer = (
+                    is_pure_linear_layer and input_dim >= 512 and output_dim >= 512
+                )
+
             if not is_pure_linear_layer:
-                self._all_layers_are_pure_linear = False
+                self._has_large_linear_layers = False

         for module_name in self.module_names:
             # Create the optimized glwe linear layer executor if needed
@@ -415,7 +427,7 @@ def _replace_modules(self):
                 module_name=module_name,
                 model_name=self.model_name,
                 verbose=self.verbose,
-                optimized_linear_execution=self._all_layers_are_pure_linear,
+                optimized_linear_execution=(self._has_large_linear_layers),
             )
             self.remote_modules[module_name] = remote_module

@@ -446,7 +458,7 @@ def forward(self, x: torch.Tensor, fhe: str = "disable") -> torch.Tensor:
         fhe_mode = HybridFHEMode(fhe)
         self.executor = None

-        if _HAS_GLWE_BACKEND and self._all_layers_are_pure_linear:
+        if _HAS_GLWE_BACKEND and self._has_large_linear_layers:
             if fhe_mode == HybridFHEMode.SIMULATE:
                 raise AssertionError(
                     "When the HybridFHEModel is instantiated with only "
@@ -572,7 +584,7 @@ def compile_model(
         # If all layers are linear and the GLWE backend is available
         # then simply quantize the model without compiling with
         # Concrete Python.
-        if self._all_layers_are_pure_linear and _HAS_GLWE_BACKEND:
+        if self._has_large_linear_layers and _HAS_GLWE_BACKEND:
             self.private_q_modules[name] = build_quantized_module(
                 self.private_modules[name],
                 calibration_data_tensor,
diff --git a/tests/torch/test_hybrid_converter.py b/tests/torch/test_hybrid_converter.py
index 821903bcf..1f25ac499 100644
--- a/tests/torch/test_hybrid_converter.py
+++ b/tests/torch/test_hybrid_converter.py
@@ -9,7 +9,6 @@
 import pytest
 import torch
 from concrete.fhe import Configuration
-from sklearn.datasets import make_moons
 from sklearn.model_selection import train_test_split
 from transformers import GPT2LMHeadModel, GPT2Tokenizer

@@ -279,7 +278,7 @@ def test_invalid_model():
         HybridFHEModel(invalid_model, module_names="sub_module")


-@pytest.mark.parametrize("n_hidden", [512, 2048])
+@pytest.mark.parametrize("n_hidden", [256, 512, 2048])
 def test_hybrid_glwe_correctness(n_hidden):
     """Tests that the GLWE backend produces correct results for the hybrid model."""

@@ -295,13 +294,15 @@ def prepare_data(x, y, test_size=0.1, random_state=42):
         y_test = torch.tensor(y_test, dtype=torch.long)
         return x_train, x_test, y_train, y_test

-    # Generate synthetic 2D data
-    x1_data, y1_data = make_moons(n_samples=num_samples, noise=0.2, random_state=42)
+    # Generate random data with n_hidden features and n_hidden classes
+    # keeping input and output dimensions equal to n_hidden.
+    x1_data = numpy.random.randn(num_samples, n_hidden)
+    y1_data = numpy.random.randint(0, n_hidden, size=num_samples)  # n_hidden classes

     # Prepare data
     x1_train, x1_test, y1_train, y1_test = prepare_data(x1_data, y1_data)

-    model = FCSmall(2, torch.nn.ReLU, hidden=n_hidden)
+    model = FCSmall(n_hidden, torch.nn.ReLU, hidden=n_hidden)
     optimizer = torch.optim.Adam(model.parameters())

     num_epochs = 100
@@ -325,38 +326,48 @@ def prepare_data(x, y, test_size=0.1, random_state=42):

     # This internal flag tells us whether all the layers
     # were linear and were replaced with the GLWE backend
-    assert hybrid_local._all_layers_are_pure_linear  # pylint: disable=protected-access
+    # Check if GLWE optimization should be used based on input dimension
+    should_use_glwe = n_hidden >= 512
+    is_pure_linear = hybrid_local._has_large_linear_layers  # pylint: disable=protected-access
+    assert is_pure_linear == should_use_glwe

     hybrid_local.compile_model(x1_train, n_bits=10)

     y_qm = hybrid_local(x1_test, fhe="disable").numpy()
     y_hybrid_torch = hybrid_local(x1_test, fhe="torch").detach().numpy()
-    y_glwe = hybrid_local(x1_test, fhe="execute").numpy()

-    y1_test = y1_test.numpy()
-    n_correct_fp32 = numpy.sum(numpy.argmax(y_torch, axis=1) == y1_test)
-    n_correct_qm = numpy.sum(numpy.argmax(y_qm, axis=1) == y1_test)
-    n_correct_glwe = numpy.sum(numpy.argmax(y_glwe, axis=1) == y1_test)
+    # Only test GLWE execution if input dimension is >= 512
+    if should_use_glwe:
+        y_glwe = hybrid_local(x1_test, fhe="execute").numpy()

-    # These two should be exactly the same
-    assert numpy.all(numpy.allclose(y_torch, y_hybrid_torch, rtol=1, atol=0.001))
+        y1_test = y1_test.numpy()
+        n_correct_fp32 = numpy.sum(numpy.argmax(y_torch, axis=1) == y1_test)
+        n_correct_qm = numpy.sum(numpy.argmax(y_qm, axis=1) == y1_test)
+        n_correct_glwe = numpy.sum(numpy.argmax(y_glwe, axis=1) == y1_test)

-    # The clear quantization vs fp32 test has more tolerance
-    threshold_fhe = 0.01
+        # These two should be exactly the same
+        assert numpy.all(numpy.allclose(y_torch, y_hybrid_torch, rtol=1, atol=0.001))

-    diff = numpy.abs(y_torch - y_glwe) > threshold_fhe
-    if numpy.any(diff):
-        print(f"Value discrepancy detected for GLWE backend, with epsilon={threshold_fhe}")
-        print("Model output (torch fp32)", y_torch[diff])
-        print("Model output (glwe)", y_glwe[diff])
-        print("Model output (quantized clear)", y_qm[diff])
+        # The clear quantization vs fp32 test has more tolerance
+        threshold_fhe = 0.01

-    assert numpy.all(numpy.allclose(y_qm, y_glwe, rtol=1, atol=threshold_fhe))
-    assert numpy.all(numpy.allclose(y_torch, y_glwe, rtol=1, atol=threshold_fhe))
+        diff = numpy.abs(y_torch - y_glwe) > threshold_fhe
+        if numpy.any(diff):
+            print(f"Value discrepancy detected for GLWE backend, with epsilon={threshold_fhe}")
+            print("Model output (torch fp32)", y_torch[diff])
+            print("Model output (glwe)", y_glwe[diff])
+            print("Model output (quantized clear)", y_qm[diff])

-    n_correct_delta_threshold_fhe = 1
-    # Check accuracy between fp32 and glwe
-    assert numpy.abs(n_correct_fp32 - n_correct_glwe) <= n_correct_delta_threshold_fhe
+        assert numpy.all(numpy.allclose(y_qm, y_glwe, rtol=1, atol=threshold_fhe))
+        assert numpy.all(numpy.allclose(y_torch, y_glwe, rtol=1, atol=threshold_fhe))

-    # Check accuracy between quantized and glwe
-    assert numpy.abs(n_correct_qm - n_correct_glwe) <= n_correct_delta_threshold_fhe
+        n_correct_delta_threshold_fhe = 1
+        # Check accuracy between fp32 and glwe
+        assert numpy.abs(n_correct_fp32 - n_correct_glwe) <= n_correct_delta_threshold_fhe
+
+        # Check accuracy between quantized and glwe
+        assert numpy.abs(n_correct_qm - n_correct_glwe) <= n_correct_delta_threshold_fhe
+    else:
+        # For non-GLWE cases, just verify the torch outputs match
+        assert numpy.all(numpy.allclose(y_torch, y_hybrid_torch, rtol=1, atol=0.001))
+        assert numpy.all(numpy.allclose(y_qm, y_hybrid_torch, rtol=1, atol=0.01))

From 5f73bd6a99c7e96e7a94d3eefb126c81ea327674 Mon Sep 17 00:00:00 2001
From: jfrery
Date: Fri, 15 Nov 2024 11:56:12 +0100
Subject: [PATCH 3/6] chore: only create keys and initialise executor once

---
 src/concrete/ml/torch/hybrid_model.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/src/concrete/ml/torch/hybrid_model.py b/src/concrete/ml/torch/hybrid_model.py
index 242065e48..da4e0ef03 100644
--- a/src/concrete/ml/torch/hybrid_model.py
+++ b/src/concrete/ml/torch/hybrid_model.py
@@ -456,7 +456,6 @@ def forward(self, x: torch.Tensor, fhe: str = "disable") -> torch.Tensor:

         # Validate the FHE mode
         fhe_mode = HybridFHEMode(fhe)
-        self.executor = None

         if _HAS_GLWE_BACKEND and self._has_large_linear_layers:
             if fhe_mode == HybridFHEMode.SIMULATE:
@@ -466,15 +465,12 @@ def forward(self, x: torch.Tensor, fhe: str = "disable") -> torch.Tensor:
                 )

             if fhe_mode in (HybridFHEMode.EXECUTE, HybridFHEMode.REMOTE, HybridFHEMode.DISABLE):
-                # If all layers are pure linear, enable the GLWE optimization for all layers
-                # and generate an encryption and compression key for all layers
-                # as they share crypto-parameters
-
-                # Loading keys from a file could be done here, and the
-                # keys could be passed as arguments to the Executor
-                self.executor = GLWELinearLayerExecutor()
-                if fhe_mode != HybridFHEMode.DISABLE:
-                    self.executor.keygen()
+                # Initialize executor only if not already done
+                if self.executor is None:
+                    self.executor = GLWELinearLayerExecutor()
+                    # Generate keys only if needed and not already done
+                    if fhe_mode != HybridFHEMode.DISABLE:
+                        self.executor.keygen()

                 # Update executor for all remote modules
                 for module in self.remote_modules.values():

From 10138bec854e295910c5377d368028f2e50c9f33 Mon Sep 17 00:00:00 2001
From: jfrery
Date: Fri, 15 Nov 2024 13:38:37 +0100
Subject: [PATCH 4/6] chore: fix tests and refresh LoraMLP notebook

---
 docs/advanced_examples/LoraMLP.ipynb   | 88 +++++++++++++-------------
 src/concrete/ml/torch/hybrid_model.py  | 17 ++---
 tests/torch/test_hybrid_converter.py   |  2 +-
 use_case_examples/deployment/README.md | 16 ++---
 4 files changed, 62 insertions(+), 61 deletions(-)

diff --git a/docs/advanced_examples/LoraMLP.ipynb b/docs/advanced_examples/LoraMLP.ipynb
index af17b90fc..7b6dc6e7c 100644
--- a/docs/advanced_examples/LoraMLP.ipynb
+++ b/docs/advanced_examples/LoraMLP.ipynb
@@ -21,7 +21,7 @@
    { "data": { "text/plain": [ "" ] }, "execution_count": 1,
@@ -324,7 +324,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "\r\n",
+      "\r",
      "Training: 0%| | 0/10 [00:00
 torch.Tensor:
         # Validate the FHE mode
         fhe_mode = HybridFHEMode(fhe)
-        if _HAS_GLWE_BACKEND and self._has_large_linear_layers:
+        if _HAS_GLWE_BACKEND and self._has_only_large_linear_layers:
             if fhe_mode == HybridFHEMode.SIMULATE:
                 raise AssertionError(
                     "When the HybridFHEModel is instantiated with only "
@@ -468,9 +468,10 @@ def forward(self, x: torch.Tensor, fhe: str = "disable") -> torch.Tensor:
                 # Initialize executor only if not already done
                 if self.executor is None:
                     self.executor = GLWELinearLayerExecutor()
+
+                # Generate keys only if needed and not already done
+                if fhe_mode != HybridFHEMode.DISABLE and self.executor.private_key is None:
+                    self.executor.keygen()

                 # Update executor for all remote modules
                 for module in self.remote_modules.values():
                     module.executor = self.executor
@@ -580,7 +581,7 @@ def compile_model(
         # If all layers are linear and the GLWE backend is available
         # then simply quantize the model without compiling with
         # Concrete Python.
-        if self._has_large_linear_layers and _HAS_GLWE_BACKEND:
+        if self._has_only_large_linear_layers and _HAS_GLWE_BACKEND:
             self.private_q_modules[name] = build_quantized_module(
                 self.private_modules[name],
                 calibration_data_tensor,
diff --git a/tests/torch/test_hybrid_converter.py b/tests/torch/test_hybrid_converter.py
index 1f25ac499..de5724c6d 100644
--- a/tests/torch/test_hybrid_converter.py
+++ b/tests/torch/test_hybrid_converter.py
@@ -328,7 +328,7 @@ def prepare_data(x, y, test_size=0.1, random_state=42):
     # were linear and were replaced with the GLWE backend
     # Check if GLWE optimization should be used based on input dimension
     should_use_glwe = n_hidden >= 512
-    is_pure_linear = hybrid_local._has_large_linear_layers  # pylint: disable=protected-access
+    is_pure_linear = hybrid_local._has_only_large_linear_layers  # pylint: disable=protected-access
     assert is_pure_linear == should_use_glwe

     hybrid_local.compile_model(x1_train, n_bits=10)
diff --git a/use_case_examples/deployment/README.md b/use_case_examples/deployment/README.md
index 38baac25b..fed7b713e 100644
--- a/use_case_examples/deployment/README.md
+++ b/use_case_examples/deployment/README.md
@@ -7,26 +7,26 @@ This folder contains examples of how to deploy Concrete ML models using Fully Ho
 The deployment process generally follows these steps:

 1. Train the model (optional, depending on the use case)
-2. Compile the model to an FHE circuit
-3. Deploy the model using Docker
-4. Run inference using a client (locally or in Docker)
+1. Compile the model to an FHE circuit
+1. Deploy the model using Docker
+1. Run inference using a client (locally or in Docker)

 ## Available Examples

 We provide three different use cases to demonstrate the deployment process:

 1. [Breast Cancer Classification](./breast_cancer/README.md)
-2. [Sentiment Analysis](./sentiment_analysis/README.md)
-3. [CIFAR-10 Image Classification](./cifar/README.md)
+1. [Sentiment Analysis](./sentiment_analysis/README.md)
+1. [CIFAR-10 Image Classification](./cifar/README.md)

 ## Getting Started

 Each example folder contains its own README with specific instructions. However, the general process is similar:

 1. Train or compile the model using the provided scripts
-2. Deploy the model using `deploy_to_docker.py` from the `server` folder
-3. Build the client Docker image
-4. Run the client to interact with the deployed model
+1. Deploy the model using `deploy_to_docker.py` from the `server` folder
+1. Build the client Docker image
+1. Run the client to interact with the deployed model

 For detailed instructions, please refer to the README in each example folder.
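Usage note (illustrative, not part of the patch series): the per-module executor wiring from patch 1 plus the caching from patches 3 and 4 means a hybrid model creates its GLWE executor and generates keys once, then reuses them on every FHE call. A minimal sketch of the caller-side behavior, assuming the GLWE backend (concrete-ml-extensions) is installed; the nn.Sequential module names ("0", "2"), the 512-wide shapes, and the calibration batch size are illustrative assumptions chosen to clear the large-linear-layer threshold:

import torch

from concrete.ml.torch.hybrid_model import HybridFHEModel

# Two linear layers, each with input and output dimension >= 512, so the
# GLWE fast path should be taken for the remote modules
model = torch.nn.Sequential(
    torch.nn.Linear(512, 512),
    torch.nn.ReLU(),
    torch.nn.Linear(512, 512),
)

# The named submodules become RemoteModule instances; names are illustrative
hybrid = HybridFHEModel(model, module_names=["0", "2"])
hybrid.compile_model(torch.randn(32, 512), n_bits=8)

# First execute call instantiates the GLWE executor and runs keygen once
y1 = hybrid(torch.randn(1, 512), fhe="execute")

# Subsequent calls reuse the cached executor and keys (patch 3/4 behavior)
y2 = hybrid(torch.randn(1, 512), fhe="execute")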
From c5e1e283f2a59728b3e663cb4c92283ac1aaee56 Mon Sep 17 00:00:00 2001
From: jfrery
Date: Tue, 19 Nov 2024 10:19:09 +0100
Subject: [PATCH 5/6] chore: no save circuit with glwe backend

---
 src/concrete/ml/torch/hybrid_model.py | 21 +++++++++++++++++++--
 tests/torch/test_hybrid_converter.py  | 18 +++++++++++++++---
 2 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/src/concrete/ml/torch/hybrid_model.py b/src/concrete/ml/torch/hybrid_model.py
index 6af89f2d6..e8bbf6c34 100644
--- a/src/concrete/ml/torch/hybrid_model.py
+++ b/src/concrete/ml/torch/hybrid_model.py
@@ -267,6 +267,7 @@ def forward(self, x: torch.Tensor) -> Union[torch.Tensor, QuantTensor]:

         elif self.fhe_local_mode == HybridFHEMode.REMOTE:  # pragma:no cover
             # Remote call
+            # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/4672
             assert self.executor is None, "Remote optimized linear layers are not yet implemented"
             y = self.remote_call(x)

@@ -410,8 +411,15 @@ def _replace_modules(self):
             # Optimal input dimension is 2048, below 512 the performance is too low.
             if is_pure_linear_layer:
                 module = self.private_modules[module_name]
-                input_dim = module.in_features if hasattr(module, "in_features") else 0
-                output_dim = module.out_features if hasattr(module, "out_features") else 0
+                # Use weight shape instead of in/out_features
+                if hasattr(module, "weight"):
+                    input_dim = module.weight.shape[
+                        1
+                    ]  # Input dimension is second dimension for Linear layers
+                    output_dim = module.weight.shape[0]  # Output dimension is first dimension
+                else:
+                    input_dim = output_dim = 0
+
                 is_pure_linear_layer = (
                     is_pure_linear_layer and input_dim >= 512 and output_dim >= 512
                 )
@@ -582,6 +590,7 @@ def compile_model(
         # then simply quantize the model without compiling with
         # Concrete Python.
         if self._has_only_large_linear_layers and _HAS_GLWE_BACKEND:
+            self.executor = GLWELinearLayerExecutor()
             self.private_q_modules[name] = build_quantized_module(
                 self.private_modules[name],
                 calibration_data_tensor,
@@ -637,7 +646,15 @@ def save_and_clear_private_info(self, path: Path, via_mlir=True):
             path (Path): The directory where the model and the FHE circuit will be saved.
             via_mlir (bool): if fhe circuits should be serialized using via_mlir option
                 useful for cross-platform (compile on one architecture and run on another)
+
+        Raises:
+            NotImplementedError: GLWE backend deployment is not yet supported
         """
+        # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/4672
+        # GLWE backend deployment is not yet supported
+        if self.executor is not None:
+            raise NotImplementedError("GLWE backend deployment is not yet supported")
+
         path = Path(path)
         path.mkdir(parents=True, exist_ok=True)
diff --git a/tests/torch/test_hybrid_converter.py b/tests/torch/test_hybrid_converter.py
index de5724c6d..110d5d5b2 100644
--- a/tests/torch/test_hybrid_converter.py
+++ b/tests/torch/test_hybrid_converter.py
@@ -73,6 +73,7 @@ def run_hybrid_llm_test(

     # Create a hybrid model
     hybrid_model = HybridFHEModel(model, module_names)
+    is_compiled = False
     try:
         hybrid_model.compile_model(
             inputs,
@@ -81,6 +82,7 @@ def run_hybrid_llm_test(
             rounding_threshold_bits=8,
             configuration=configuration,
         )
+        is_compiled = True
     except RuntimeError as error:
         # When reshaping adds PBSs we sometimes encounter NoParametersFound
         # when compiling. In this case we skip the rest since we can't simulate
@@ -153,10 +155,20 @@ def run_hybrid_llm_test(

     # Get the temp directory path
     if not has_pbs and glwe_backend_installed:
-        # Deployment of GLWE backend hybrid models is not yet supported
-        with pytest.raises(AttributeError, match="The quantized module is not compiled.*"):
-            hybrid_model.save_and_clear_private_info(temp_dir_path)
+        if is_compiled:
+            # Deployment of GLWE backend hybrid models is not yet supported
+            with pytest.raises(
+                NotImplementedError, match="GLWE backend deployment is not yet supported"
+            ):
+                hybrid_model.save_and_clear_private_info(temp_dir_path)
+        else:
+            # Check that we get an error when trying to save a non-compiled model
+            with pytest.raises(
+                AttributeError,
+                match="The quantized module is not compiled. Please run compile*",
+            ):
+                hybrid_model.save_and_clear_private_info(temp_dir_path)
     else:
         hybrid_model.save_and_clear_private_info(temp_dir_path)

From c4a68e48cc83f099340b1d7b375cc694b1f057d6 Mon Sep 17 00:00:00 2001
From: jfrery
Date: Wed, 20 Nov 2024 16:47:54 +0100
Subject: [PATCH 6/6] chore: fix patch

---
 tests/torch/test_hybrid_converter.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/tests/torch/test_hybrid_converter.py b/tests/torch/test_hybrid_converter.py
index 110d5d5b2..67af03037 100644
--- a/tests/torch/test_hybrid_converter.py
+++ b/tests/torch/test_hybrid_converter.py
@@ -1,5 +1,6 @@
 """Tests for the hybrid model converter."""

+import importlib
 import sys
 import tempfile
 from pathlib import Path
@@ -64,14 +65,14 @@ def run_hybrid_llm_test(
     if has_pbs_reshape:
         has_pbs = True

-    # Propagate glwe_backend_installed state being tested to constants of affected modules
-    for affected_module in (
-        concrete.ml.quantization.linear_op_glwe_backend,
-        concrete.ml.torch.hybrid_model,
-    ):
-        m.setattr(affected_module, "_HAS_GLWE_BACKEND", glwe_backend_installed)
+    # Patching for GLWE backend
+    if not glwe_backend_installed:
+        m.setitem(sys.modules, "concrete_ml_extensions", None)
+
+    # Reload the affected modules to ensure the changes take effect
+    importlib.reload(concrete.ml.quantization.linear_op_glwe_backend)
+    importlib.reload(concrete.ml.torch.hybrid_model)

-    # Create a hybrid model
     hybrid_model = HybridFHEModel(model, module_names)
     is_compiled = False
     try:
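Deployment-guard sketch (illustrative, not part of the patches): after patch 5, compiling a model whose remote layers all take the GLWE path instantiates an executor on the HybridFHEModel, and save_and_clear_private_info then refuses to serialize it. A minimal sketch, assuming concrete-ml-extensions is installed and reusing the toy model conventions from the earlier sketch:

import tempfile
from pathlib import Path

import pytest
import torch

from concrete.ml.torch.hybrid_model import HybridFHEModel

model = torch.nn.Sequential(torch.nn.Linear(512, 512))
hybrid = HybridFHEModel(model, module_names=["0"])

# With the GLWE backend available, compile_model takes the quantize-only
# path and sets hybrid.executor (patch 5), so saving must fail loudly
hybrid.compile_model(torch.randn(32, 512), n_bits=8)

with tempfile.TemporaryDirectory() as tmp_dir:
    with pytest.raises(NotImplementedError, match="GLWE backend deployment"):
        hybrid.save_and_clear_private_info(Path(tmp_dir))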