From aa4402ce9904cc3b6f51f2de920452ab4ffcac9f Mon Sep 17 00:00:00 2001
From: jfrery
Date: Thu, 14 Nov 2024 20:34:03 +0100
Subject: [PATCH 1/6] chore: fix hybrid model glwe lora mlp

---
 .gitleaksignore                       |  1 +
 src/concrete/ml/torch/hybrid_model.py | 41 +++++++++-------------------
 2 files changed, 13 insertions(+), 29 deletions(-)

diff --git a/.gitleaksignore b/.gitleaksignore
index 1356df088..a1c87598d 100644
--- a/.gitleaksignore
+++ b/.gitleaksignore
@@ -10,3 +10,4 @@ f41de03048a9ed27946b875e81b34138bb4bb17b:use_case_examples/training/analyze.ipyn
 e2904473898ddd325f245f4faca526a0e9520f49:builders/Dockerfile.zamalang-env:generic-api-key:5
 7d5e885816f1f1e432dd94da38c5c8267292056a:docs/advanced_examples/XGBRegressor.ipynb:aws-access-token:1026
 25c5e7abaa7382520af3fb7a64266e193b1f6a59:poetry.lock:square-access-token:6401
+eebd4bea78f6dd2361baa7f94f68ae4cba8b9fe8:tests/deployment/test_deployment.py:generic-api-key:20
\ No newline at end of file
diff --git a/src/concrete/ml/torch/hybrid_model.py b/src/concrete/ml/torch/hybrid_model.py
index ae2b885e8..acf244330 100644
--- a/src/concrete/ml/torch/hybrid_model.py
+++ b/src/concrete/ml/torch/hybrid_model.py
@@ -2,7 +2,6 @@
 # pylint: disable=too-many-lines

 import ast
-import contextvars
 import io
 import sys
 import time
@@ -102,13 +101,6 @@ def convert_conv1d_to_linear(layer_or_module):
     return layer_or_module


-# This module member is instantiated by the Hybrid FHE model
-# when hybrid FHE forward is called and the GLWE backend is available
-_optimized_linear_executor: contextvars.ContextVar[Optional[GLWELinearLayerExecutor]] = (
-    contextvars.ContextVar("optimized_linear_executor")
-)
-
-
 # pylint: disable-next=too-many-instance-attributes
 class RemoteModule(nn.Module):
     """A wrapper class for the modules to be evaluated remotely with FHE."""
@@ -136,6 +128,7 @@ def __init__(
         self.model_name: Optional[str] = model_name
         self.verbose = verbose
         self.optimized_linear_execution = optimized_linear_execution
+        self.executor: Optional[GLWELinearLayerExecutor] = None

     def init_fhe_client(
         self, path_to_client: Optional[Path] = None, path_to_keys: Optional[Path] = None
@@ -252,15 +245,10 @@ def forward(self, x: torch.Tensor) -> Union[torch.Tensor, QuantTensor]:
         }:
             assert self.private_q_module is not None

-            try:
-                optimized_linear_layer_executor = _optimized_linear_executor.get()
-            except LookupError:
-                optimized_linear_layer_executor = None
-
-            if optimized_linear_layer_executor:
+            if self.executor:
                 # Delegate to the optimized GLWE executor
                 y = torch.Tensor(
-                    optimized_linear_layer_executor.forward(
+                    self.executor.forward(
                         x.detach().numpy(), self.private_q_module, self.fhe_local_mode
                     )
                 )
@@ -269,6 +257,7 @@ def forward(self, x: torch.Tensor) -> Union[torch.Tensor, QuantTensor]:
                 y = torch.Tensor(
                     self.private_q_module.forward(x.detach().numpy(), fhe=self.fhe_local_mode.value)
                 )
+
         elif self.fhe_local_mode == HybridFHEMode.CALIBRATE:
             # Calling torch + gathering calibration data
             assert self.private_module is not None
@@ -278,14 +267,7 @@ def forward(self, x: torch.Tensor) -> Union[torch.Tensor, QuantTensor]:

         elif self.fhe_local_mode == HybridFHEMode.REMOTE:  # pragma:no cover
             # Remote call
-            try:
-                optimized_linear_layer_executor = _optimized_linear_executor.get()
-            except LookupError:
-                optimized_linear_layer_executor = None
-
-            assert optimized_linear_layer_executor is None, (
-                "Remote optimized linear layers " "are not yet implemented"
-            )
+            assert self.executor is None, "Remote optimized linear layers are not yet implemented"
             y = self.remote_call(x)

         elif self.fhe_local_mode == HybridFHEMode.TORCH:
@@ -400,6 +382,7 @@ def __init__(
         self.configuration: Optional[Configuration] = None
         self.model_name = model_name
         self.verbose = verbose
+        self.executor: Optional[GLWELinearLayerExecutor] = None

         self._replace_modules()

@@ -461,6 +444,7 @@ def forward(self, x: torch.Tensor, fhe: str = "disable") -> torch.Tensor:

         # Validate the FHE mode
         fhe_mode = HybridFHEMode(fhe)
+        self.executor = None

         if _HAS_GLWE_BACKEND and self._all_layers_are_pure_linear:
             if fhe_mode == HybridFHEMode.SIMULATE:
@@ -476,17 +460,16 @@ def forward(self, x: torch.Tensor, fhe: str = "disable") -> torch.Tensor:

                 # Loading keys from a file could be done here, and the
                 # keys could be passed as arguments to the Executor
-                executor = GLWELinearLayerExecutor()
-
+                self.executor = GLWELinearLayerExecutor()
                 if fhe_mode != HybridFHEMode.DISABLE:
-                    executor.keygen()
+                    self.executor.keygen()

-                _optimized_linear_executor.set(executor)
+                # Update executor for all remote modules
+                for module in self.remote_modules.values():
+                    module.executor = self.executor

         result = self.model(x)

-        _optimized_linear_executor.set(None)
-
         return result

     def __call__(self, x: torch.Tensor, fhe: str = "disable") -> torch.Tensor:

From f0cd6738192f7793560fbca3a70f6dd3f2653282 Mon Sep 17 00:00:00 2001
From: jfrery
Date: Fri, 15 Nov 2024 11:29:08 +0100
Subject: [PATCH 2/6] chore: only run glwe backend when input / output dimensions are above 512

---
 src/concrete/ml/torch/hybrid_model.py | 22 +++++++--
 tests/torch/test_hybrid_converter.py  | 67 ++++++++++++++++-----------
 2 files changed, 56 insertions(+), 33 deletions(-)

diff --git a/src/concrete/ml/torch/hybrid_model.py b/src/concrete/ml/torch/hybrid_model.py
index acf244330..242065e48 100644
--- a/src/concrete/ml/torch/hybrid_model.py
+++ b/src/concrete/ml/torch/hybrid_model.py
@@ -389,7 +389,7 @@ def __init__(

     def _replace_modules(self):
         """Replace the private modules in the model with remote layers."""
-        self._all_layers_are_pure_linear = True
+        self._has_large_linear_layers = True
         for module_name in self.module_names:
             # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3858
             # Conv1d introduce reshaping operations which adds more TLU
@@ -404,8 +404,20 @@ def _replace_modules(self):
                 self.private_modules[module_name],
                 (nn.Linear, ForwardModuleLinear, BackwardModuleLinear),
             )
+
+            # Check input and output dimensions for linear layers
+            # If the input or output dimension is below 512 we do not use the GLWE optimization.
+            # Optimal input dimension is 2048, below 512 the performance is too low.
+            if is_pure_linear_layer:
+                module = self.private_modules[module_name]
+                input_dim = module.in_features if hasattr(module, "in_features") else 0
+                output_dim = module.out_features if hasattr(module, "out_features") else 0
+                is_pure_linear_layer = (
+                    is_pure_linear_layer and input_dim >= 512 and output_dim >= 512
+                )
+
             if not is_pure_linear_layer:
-                self._all_layers_are_pure_linear = False
+                self._has_large_linear_layers = False

         for module_name in self.module_names:
             # Create the optimized glwe linear layer executor if needed
@@ -415,7 +427,7 @@ def _replace_modules(self):
                 module_name=module_name,
                 model_name=self.model_name,
                 verbose=self.verbose,
-                optimized_linear_execution=self._all_layers_are_pure_linear,
+                optimized_linear_execution=(self._has_large_linear_layers),
             )
             self.remote_modules[module_name] = remote_module

@@ -446,7 +458,7 @@ def forward(self, x: torch.Tensor, fhe: str = "disable") -> torch.Tensor:
         fhe_mode = HybridFHEMode(fhe)
         self.executor = None

-        if _HAS_GLWE_BACKEND and self._all_layers_are_pure_linear:
+        if _HAS_GLWE_BACKEND and self._has_large_linear_layers:
             if fhe_mode == HybridFHEMode.SIMULATE:
                 raise AssertionError(
                     "When the HybridFHEModel is instantiated with only "
@@ -572,7 +584,7 @@ def compile_model(
         # If all layers are linear and the GLWE backend is available
         # then simply quantize the model without compiling with
         # Concrete Python.
-        if self._all_layers_are_pure_linear and _HAS_GLWE_BACKEND:
+        if self._has_large_linear_layers and _HAS_GLWE_BACKEND:
             self.private_q_modules[name] = build_quantized_module(
                 self.private_modules[name],
                 calibration_data_tensor,
diff --git a/tests/torch/test_hybrid_converter.py b/tests/torch/test_hybrid_converter.py
index 821903bcf..1f25ac499 100644
--- a/tests/torch/test_hybrid_converter.py
+++ b/tests/torch/test_hybrid_converter.py
@@ -9,7 +9,6 @@
 import pytest
 import torch
 from concrete.fhe import Configuration
-from sklearn.datasets import make_moons
 from sklearn.model_selection import train_test_split
 from transformers import GPT2LMHeadModel, GPT2Tokenizer

@@ -279,7 +278,7 @@ def test_invalid_model():
         HybridFHEModel(invalid_model, module_names="sub_module")


-@pytest.mark.parametrize("n_hidden", [512, 2048])
+@pytest.mark.parametrize("n_hidden", [256, 512, 2048])
 def test_hybrid_glwe_correctness(n_hidden):
     """Tests that the GLWE backend produces correct results for the hybrid model."""

@@ -295,13 +294,15 @@ def prepare_data(x, y, test_size=0.1, random_state=42):
         y_test = torch.tensor(y_test, dtype=torch.long)
         return x_train, x_test, y_train, y_test

-    # Generate synthetic 2D data
-    x1_data, y1_data = make_moons(n_samples=num_samples, noise=0.2, random_state=42)
+    # Generate random data with n_hidden features and n_hidden classes
+    # keeping input and output dimensions equal to n_hidden.
+    x1_data = numpy.random.randn(num_samples, n_hidden)
+    y1_data = numpy.random.randint(0, n_hidden, size=num_samples)  # n_hidden classes

     # Prepare data
     x1_train, x1_test, y1_train, y1_test = prepare_data(x1_data, y1_data)

-    model = FCSmall(2, torch.nn.ReLU, hidden=n_hidden)
+    model = FCSmall(n_hidden, torch.nn.ReLU, hidden=n_hidden)
     optimizer = torch.optim.Adam(model.parameters())

     num_epochs = 100
@@ -325,38 +326,48 @@ def prepare_data(x, y, test_size=0.1, random_state=42):

     # This internal flag tells us whether all the layers
     # were linear and were replaced with the GLWE backend
-    assert hybrid_local._all_layers_are_pure_linear  # pylint: disable=protected-access
+    # Check if GLWE optimization should be used based on input dimension
+    should_use_glwe = n_hidden >= 512
+    is_pure_linear = hybrid_local._has_large_linear_layers  # pylint: disable=protected-access
+    assert is_pure_linear == should_use_glwe

     hybrid_local.compile_model(x1_train, n_bits=10)

     y_qm = hybrid_local(x1_test, fhe="disable").numpy()
     y_hybrid_torch = hybrid_local(x1_test, fhe="torch").detach().numpy()
-    y_glwe = hybrid_local(x1_test, fhe="execute").numpy()

-    y1_test = y1_test.numpy()
-    n_correct_fp32 = numpy.sum(numpy.argmax(y_torch, axis=1) == y1_test)
-    n_correct_qm = numpy.sum(numpy.argmax(y_qm, axis=1) == y1_test)
-    n_correct_glwe = numpy.sum(numpy.argmax(y_glwe, axis=1) == y1_test)
+    # Only test GLWE execution if input dimension is >= 512
+    if should_use_glwe:
+        y_glwe = hybrid_local(x1_test, fhe="execute").numpy()

-    # These two should be exactly the same
-    assert numpy.all(numpy.allclose(y_torch, y_hybrid_torch, rtol=1, atol=0.001))
+        y1_test = y1_test.numpy()
+        n_correct_fp32 = numpy.sum(numpy.argmax(y_torch, axis=1) == y1_test)
+        n_correct_qm = numpy.sum(numpy.argmax(y_qm, axis=1) == y1_test)
+        n_correct_glwe = numpy.sum(numpy.argmax(y_glwe, axis=1) == y1_test)

-    # The clear quantization vs fp32 test has more tolerance
-    threshold_fhe = 0.01
+        # These two should be exactly the same
+        assert numpy.all(numpy.allclose(y_torch, y_hybrid_torch, rtol=1, atol=0.001))

-    diff = numpy.abs(y_torch - y_glwe) > threshold_fhe
-    if numpy.any(diff):
-        print(f"Value discrepancy detected for GLWE backend, with epsilon={threshold_fhe}")
-        print("Model output (torch fp32)", y_torch[diff])
-        print("Model output (glwe)", y_glwe[diff])
-        print("Model output (quantized clear)", y_qm[diff])
+        # The clear quantization vs fp32 test has more tolerance
+        threshold_fhe = 0.01

-    assert numpy.all(numpy.allclose(y_qm, y_glwe, rtol=1, atol=threshold_fhe))
-    assert numpy.all(numpy.allclose(y_torch, y_glwe, rtol=1, atol=threshold_fhe))
+        diff = numpy.abs(y_torch - y_glwe) > threshold_fhe
+        if numpy.any(diff):
+            print(f"Value discrepancy detected for GLWE backend, with epsilon={threshold_fhe}")
+            print("Model output (torch fp32)", y_torch[diff])
+            print("Model output (glwe)", y_glwe[diff])
+            print("Model output (quantized clear)", y_qm[diff])

-    n_correct_delta_threshold_fhe = 1
-    # Check accuracy between fp32 and glwe
-    assert numpy.abs(n_correct_fp32 - n_correct_glwe) <= n_correct_delta_threshold_fhe
+        assert numpy.all(numpy.allclose(y_qm, y_glwe, rtol=1, atol=threshold_fhe))
+        assert numpy.all(numpy.allclose(y_torch, y_glwe, rtol=1, atol=threshold_fhe))

-    # Check accuracy between quantized and glwe
-    assert numpy.abs(n_correct_qm - n_correct_glwe) <= n_correct_delta_threshold_fhe
+        n_correct_delta_threshold_fhe = 1
+        # Check accuracy between fp32 and glwe
+        assert numpy.abs(n_correct_fp32 - n_correct_glwe) <= n_correct_delta_threshold_fhe
+
+        # Check accuracy between quantized and glwe
+        assert numpy.abs(n_correct_qm - n_correct_glwe) <= n_correct_delta_threshold_fhe
+    else:
+        # For non-GLWE cases, just verify the torch outputs match
+        assert numpy.all(numpy.allclose(y_torch, y_hybrid_torch, rtol=1, atol=0.001))
+        assert numpy.all(numpy.allclose(y_qm, y_hybrid_torch, rtol=1, atol=0.01))

From 5f73bd6a99c7e96e7a94d3eefb126c81ea327674 Mon Sep 17 00:00:00 2001
From: jfrery
Date: Fri, 15 Nov 2024 11:56:12 +0100
Subject: [PATCH 3/6] chore: only create keys and initialise executor once

---
 src/concrete/ml/torch/hybrid_model.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/src/concrete/ml/torch/hybrid_model.py b/src/concrete/ml/torch/hybrid_model.py
index 242065e48..da4e0ef03 100644
--- a/src/concrete/ml/torch/hybrid_model.py
+++ b/src/concrete/ml/torch/hybrid_model.py
@@ -456,7 +456,6 @@ def forward(self, x: torch.Tensor, fhe: str = "disable") -> torch.Tensor:

         # Validate the FHE mode
         fhe_mode = HybridFHEMode(fhe)
-        self.executor = None

         if _HAS_GLWE_BACKEND and self._has_large_linear_layers:
             if fhe_mode == HybridFHEMode.SIMULATE:
@@ -466,15 +465,12 @@ def forward(self, x: torch.Tensor, fhe: str = "disable") -> torch.Tensor:
                 )

             if fhe_mode in (HybridFHEMode.EXECUTE, HybridFHEMode.REMOTE, HybridFHEMode.DISABLE):
-                # If all layers are pure linear, enable the GLWE optimization for all layers
-                # and generate an encryption and compression key for all layers
-                # as they share crypto-parameters
-
-                # Loading keys from a file could be done here, and the
-                # keys could be passed as arguments to the Executor
-                self.executor = GLWELinearLayerExecutor()
-                if fhe_mode != HybridFHEMode.DISABLE:
-                    self.executor.keygen()
+                # Initialize executor only if not already done
+                if self.executor is None:
+                    self.executor = GLWELinearLayerExecutor()
+                    # Generate keys only if needed and not already done
+                    if fhe_mode != HybridFHEMode.DISABLE:
+                        self.executor.keygen()

                 # Update executor for all remote modules
                 for module in self.remote_modules.values():

From 10138bec854e295910c5377d368028f2e50c9f33 Mon Sep 17 00:00:00 2001
From: jfrery
Date: Fri, 15 Nov 2024 13:38:37 +0100
Subject: [PATCH 4/6] chore: fix tests and refresh LoraMLP notebook

---
 docs/advanced_examples/LoraMLP.ipynb   | 88 +++++++++++++-------------
 src/concrete/ml/torch/hybrid_model.py  | 17 ++---
 tests/torch/test_hybrid_converter.py   |  2 +-
 use_case_examples/deployment/README.md | 16 ++---
 4 files changed, 62 insertions(+), 61 deletions(-)

diff --git a/docs/advanced_examples/LoraMLP.ipynb b/docs/advanced_examples/LoraMLP.ipynb
index af17b90fc..7b6dc6e7c 100644
--- a/docs/advanced_examples/LoraMLP.ipynb
+++ b/docs/advanced_examples/LoraMLP.ipynb
@@ -21,7 +21,7 @@
    { "data": { "text/plain": [ "" ] }, "execution_count": 1,
@@ -324,7 +324,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "\r\n",
+      "\r",
      "Training: 0%| | 0/10 [00:00
 torch.Tensor:
         # Validate the FHE mode
         fhe_mode = HybridFHEMode(fhe)
-        if _HAS_GLWE_BACKEND and self._has_large_linear_layers:
+        if _HAS_GLWE_BACKEND and self._has_only_large_linear_layers:
             if fhe_mode == HybridFHEMode.SIMULATE:
                 raise AssertionError(
                     "When the HybridFHEModel is instantiated with only "
@@ -468,9 +468,10 @@ def forward(self, x: torch.Tensor, fhe: str = "disable") -> torch.Tensor:
                 # Initialize executor only if not already done
                 if self.executor is None:
                     self.executor = GLWELinearLayerExecutor()
+
+                # Generate keys only if needed and not already done
+                if fhe_mode != HybridFHEMode.DISABLE and self.executor.private_key is None:
+                    self.executor.keygen()

                 # Update executor for all remote modules
                 for module in self.remote_modules.values():
                     module.executor = self.executor
@@ -580,7 +581,7 @@ def compile_model(
         # If all layers are linear and the GLWE backend is available
         # then simply quantize the model without compiling with
         # Concrete Python.
-        if self._has_large_linear_layers and _HAS_GLWE_BACKEND:
+        if self._has_only_large_linear_layers and _HAS_GLWE_BACKEND:
             self.private_q_modules[name] = build_quantized_module(
                 self.private_modules[name],
                 calibration_data_tensor,
diff --git a/tests/torch/test_hybrid_converter.py b/tests/torch/test_hybrid_converter.py
index 1f25ac499..de5724c6d 100644
--- a/tests/torch/test_hybrid_converter.py
+++ b/tests/torch/test_hybrid_converter.py
@@ -328,7 +328,7 @@ def prepare_data(x, y, test_size=0.1, random_state=42):
     # were linear and were replaced with the GLWE backend
     # Check if GLWE optimization should be used based on input dimension
     should_use_glwe = n_hidden >= 512
-    is_pure_linear = hybrid_local._has_large_linear_layers  # pylint: disable=protected-access
+    is_pure_linear = hybrid_local._has_only_large_linear_layers  # pylint: disable=protected-access
     assert is_pure_linear == should_use_glwe

     hybrid_local.compile_model(x1_train, n_bits=10)
diff --git a/use_case_examples/deployment/README.md b/use_case_examples/deployment/README.md
index 38baac25b..fed7b713e 100644
--- a/use_case_examples/deployment/README.md
+++ b/use_case_examples/deployment/README.md
@@ -7,26 +7,26 @@ This folder contains examples of how to deploy Concrete ML models using Fully Ho
 The deployment process generally follows these steps:

 1. Train the model (optional, depending on the use case)
-2. Compile the model to an FHE circuit
-3. Deploy the model using Docker
-4. Run inference using a client (locally or in Docker)
+1. Compile the model to an FHE circuit
+1. Deploy the model using Docker
+1. Run inference using a client (locally or in Docker)

 ## Available Examples

 We provide three different use cases to demonstrate the deployment process:

 1. [Breast Cancer Classification](./breast_cancer/README.md)
-2. [Sentiment Analysis](./sentiment_analysis/README.md)
-3. [CIFAR-10 Image Classification](./cifar/README.md)
+1. [Sentiment Analysis](./sentiment_analysis/README.md)
+1. [CIFAR-10 Image Classification](./cifar/README.md)

 ## Getting Started

 Each example folder contains its own README with specific instructions. However, the general process is similar:

 1. Train or compile the model using the provided scripts
-2. Deploy the model using `deploy_to_docker.py` from the `server` folder
-3. Build the client Docker image
-4. Run the client to interact with the deployed model
+1. Deploy the model using `deploy_to_docker.py` from the `server` folder
+1. Build the client Docker image
+1. Run the client to interact with the deployed model

 For detailed instructions, please refer to the README in each example folder.
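Usage note (illustrative, not part of the patch series): the per-module executor wiring from patch 1 plus the caching from patches 3 and 4 means a hybrid model creates its GLWE executor and generates keys once, then reuses them on every FHE call. A minimal sketch of the caller-side behavior, assuming the GLWE backend (concrete-ml-extensions) is installed; the nn.Sequential module names ("0", "2"), the 512-wide shapes, and the calibration batch size are illustrative assumptions chosen to clear the large-linear-layer threshold:

import torch

from concrete.ml.torch.hybrid_model import HybridFHEModel

# Two linear layers, each with input and output dimension >= 512, so the
# GLWE fast path should be taken for the remote modules
model = torch.nn.Sequential(
    torch.nn.Linear(512, 512),
    torch.nn.ReLU(),
    torch.nn.Linear(512, 512),
)

# The named submodules become RemoteModule instances; names are illustrative
hybrid = HybridFHEModel(model, module_names=["0", "2"])
hybrid.compile_model(torch.randn(32, 512), n_bits=8)

# First execute call instantiates the GLWE executor and runs keygen once
y1 = hybrid(torch.randn(1, 512), fhe="execute")

# Subsequent calls reuse the cached executor and keys (patch 3/4 behavior)
y2 = hybrid(torch.randn(1, 512), fhe="execute")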
From c5e1e283f2a59728b3e663cb4c92283ac1aaee56 Mon Sep 17 00:00:00 2001
From: jfrery
Date: Tue, 19 Nov 2024 10:19:09 +0100
Subject: [PATCH 5/6] chore: no save circuit with glwe backend

---
 src/concrete/ml/torch/hybrid_model.py | 21 +++++++++++++++++++--
 tests/torch/test_hybrid_converter.py  | 18 +++++++++++++++---
 2 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/src/concrete/ml/torch/hybrid_model.py b/src/concrete/ml/torch/hybrid_model.py
index 6af89f2d6..e8bbf6c34 100644
--- a/src/concrete/ml/torch/hybrid_model.py
+++ b/src/concrete/ml/torch/hybrid_model.py
@@ -267,6 +267,7 @@ def forward(self, x: torch.Tensor) -> Union[torch.Tensor, QuantTensor]:

         elif self.fhe_local_mode == HybridFHEMode.REMOTE:  # pragma:no cover
             # Remote call
+            # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/4672
             assert self.executor is None, "Remote optimized linear layers are not yet implemented"
             y = self.remote_call(x)

@@ -410,8 +411,15 @@ def _replace_modules(self):
             # Optimal input dimension is 2048, below 512 the performance is too low.
             if is_pure_linear_layer:
                 module = self.private_modules[module_name]
-                input_dim = module.in_features if hasattr(module, "in_features") else 0
-                output_dim = module.out_features if hasattr(module, "out_features") else 0
+                # Use weight shape instead of in/out_features
+                if hasattr(module, "weight"):
+                    input_dim = module.weight.shape[
+                        1
+                    ]  # Input dimension is second dimension for Linear layers
+                    output_dim = module.weight.shape[0]  # Output dimension is first dimension
+                else:
+                    input_dim = output_dim = 0
+
                 is_pure_linear_layer = (
                     is_pure_linear_layer and input_dim >= 512 and output_dim >= 512
                 )
@@ -582,6 +590,7 @@ def compile_model(
         # then simply quantize the model without compiling with
         # Concrete Python.
         if self._has_only_large_linear_layers and _HAS_GLWE_BACKEND:
+            self.executor = GLWELinearLayerExecutor()
             self.private_q_modules[name] = build_quantized_module(
                 self.private_modules[name],
                 calibration_data_tensor,
@@ -637,7 +646,15 @@ def save_and_clear_private_info(self, path: Path, via_mlir=True):
             path (Path): The directory where the model and the FHE circuit will be saved.
             via_mlir (bool): if fhe circuits should be serialized using via_mlir option
                 useful for cross-platform (compile on one architecture and run on another)
+
+        Raises:
+            NotImplementedError: GLWE backend deployment is not yet supported
         """
+        # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/4672
+        # GLWE backend deployment is not yet supported
+        if self.executor is not None:
+            raise NotImplementedError("GLWE backend deployment is not yet supported")
+
         path = Path(path)
         path.mkdir(parents=True, exist_ok=True)
diff --git a/tests/torch/test_hybrid_converter.py b/tests/torch/test_hybrid_converter.py
index de5724c6d..110d5d5b2 100644
--- a/tests/torch/test_hybrid_converter.py
+++ b/tests/torch/test_hybrid_converter.py
@@ -73,6 +73,7 @@ def run_hybrid_llm_test(

     # Create a hybrid model
     hybrid_model = HybridFHEModel(model, module_names)
+    is_compiled = False
     try:
         hybrid_model.compile_model(
             inputs,
@@ -81,6 +82,7 @@ def run_hybrid_llm_test(
             rounding_threshold_bits=8,
             configuration=configuration,
         )
+        is_compiled = True
     except RuntimeError as error:
         # When reshaping adds PBSs we sometimes encounter NoParametersFound
         # when compiling. In this case we skip the rest since we can't simulate
@@ -153,10 +155,20 @@ def run_hybrid_llm_test(

     # Get the temp directory path
     if not has_pbs and glwe_backend_installed:
-        # Deployment of GLWE backend hybrid models is not yet supported
-        with pytest.raises(AttributeError, match="The quantized module is not compiled.*"):
-            hybrid_model.save_and_clear_private_info(temp_dir_path)
+        if is_compiled:
+            # Deployment of GLWE backend hybrid models is not yet supported
+            with pytest.raises(
+                NotImplementedError, match="GLWE backend deployment is not yet supported"
+            ):
+                hybrid_model.save_and_clear_private_info(temp_dir_path)
+        else:
+            # Check that we get an error when trying to save a non-compiled model
+            with pytest.raises(
+                AttributeError,
+                match="The quantized module is not compiled. Please run compile*",
+            ):
+                hybrid_model.save_and_clear_private_info(temp_dir_path)
     else:
         hybrid_model.save_and_clear_private_info(temp_dir_path)

From c4a68e48cc83f099340b1d7b375cc694b1f057d6 Mon Sep 17 00:00:00 2001
From: jfrery
Date: Wed, 20 Nov 2024 16:47:54 +0100
Subject: [PATCH 6/6] chore: fix patch

---
 tests/torch/test_hybrid_converter.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/tests/torch/test_hybrid_converter.py b/tests/torch/test_hybrid_converter.py
index 110d5d5b2..67af03037 100644
--- a/tests/torch/test_hybrid_converter.py
+++ b/tests/torch/test_hybrid_converter.py
@@ -1,5 +1,6 @@
 """Tests for the hybrid model converter."""

+import importlib
 import sys
 import tempfile
 from pathlib import Path
@@ -64,14 +65,14 @@ def run_hybrid_llm_test(
     if has_pbs_reshape:
         has_pbs = True

-    # Propagate glwe_backend_installed state being tested to constants of affected modules
-    for affected_module in (
-        concrete.ml.quantization.linear_op_glwe_backend,
-        concrete.ml.torch.hybrid_model,
-    ):
-        m.setattr(affected_module, "_HAS_GLWE_BACKEND", glwe_backend_installed)
+    # Patching for GLWE backend
+    if not glwe_backend_installed:
+        m.setitem(sys.modules, "concrete_ml_extensions", None)
+
+    # Reload the affected modules to ensure the changes take effect
+    importlib.reload(concrete.ml.quantization.linear_op_glwe_backend)
+    importlib.reload(concrete.ml.torch.hybrid_model)

-    # Create a hybrid model
     hybrid_model = HybridFHEModel(model, module_names)
     is_compiled = False
     try:
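Deployment-guard sketch (illustrative, not part of the patches): after patch 5, compiling a model whose remote layers all take the GLWE path instantiates an executor on the HybridFHEModel, and save_and_clear_private_info then refuses to serialize it. A minimal sketch, assuming concrete-ml-extensions is installed and reusing the toy model conventions from the earlier sketch:

import tempfile
from pathlib import Path

import pytest
import torch

from concrete.ml.torch.hybrid_model import HybridFHEModel

model = torch.nn.Sequential(torch.nn.Linear(512, 512))
hybrid = HybridFHEModel(model, module_names=["0"])

# With the GLWE backend available, compile_model takes the quantize-only
# path and sets hybrid.executor (patch 5), so saving must fail loudly
hybrid.compile_model(torch.randn(32, 512), n_bits=8)

with tempfile.TemporaryDirectory() as tmp_dir:
    with pytest.raises(NotImplementedError, match="GLWE backend deployment"):
        hybrid.save_and_clear_private_info(Path(tmp_dir))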