diff --git a/.gitleaksignore b/.gitleaksignore
index 1356df088..a1c87598d 100644
--- a/.gitleaksignore
+++ b/.gitleaksignore
@@ -10,3 +10,4 @@ f41de03048a9ed27946b875e81b34138bb4bb17b:use_case_examples/training/analyze.ipyn
 e2904473898ddd325f245f4faca526a0e9520f49:builders/Dockerfile.zamalang-env:generic-api-key:5
 7d5e885816f1f1e432dd94da38c5c8267292056a:docs/advanced_examples/XGBRegressor.ipynb:aws-access-token:1026
 25c5e7abaa7382520af3fb7a64266e193b1f6a59:poetry.lock:square-access-token:6401
+eebd4bea78f6dd2361baa7f94f68ae4cba8b9fe8:tests/deployment/test_deployment.py:generic-api-key:20
\ No newline at end of file
diff --git a/docs/advanced_examples/LoraMLP.ipynb b/docs/advanced_examples/LoraMLP.ipynb
index af17b90fc..7b6dc6e7c 100644
--- a/docs/advanced_examples/LoraMLP.ipynb
+++ b/docs/advanced_examples/LoraMLP.ipynb
@@ -21,7 +21,7 @@
    {
     "data": {
      "text/plain": [
-      ""
+      ""
      ]
    },
    "execution_count": 1,
@@ -324,7 +324,7 @@
    "name": "stderr",
    "output_type": "stream",
    "text": [
-    "\r\n",
+    "\r",
     "Training: 0%| | 0/10 [00:00
diff --git a/src/concrete/ml/torch/hybrid_model.py b/src/concrete/ml/torch/hybrid_model.py
--- a/src/concrete/ml/torch/hybrid_model.py
+++ b/src/concrete/ml/torch/hybrid_model.py
@@ ... @@ def forward(self, x: torch.Tensor) -> Union[torch.Tensor, QuantTensor]:
         }:
             assert self.private_q_module is not None
-            try:
-                optimized_linear_layer_executor = _optimized_linear_executor.get()
-            except LookupError:
-                optimized_linear_layer_executor = None
-
-            if optimized_linear_layer_executor:
+            if self.executor:
                 # Delegate to the optimized GLWE executor
                 y = torch.Tensor(
-                    optimized_linear_layer_executor.forward(
+                    self.executor.forward(
                         x.detach().numpy(), self.private_q_module, self.fhe_local_mode
                     )
                 )
@@ -269,6 +257,7 @@ def forward(self, x: torch.Tensor) -> Union[torch.Tensor, QuantTensor]:
             y = torch.Tensor(
                 self.private_q_module.forward(x.detach().numpy(), fhe=self.fhe_local_mode.value)
             )
+
         elif self.fhe_local_mode == HybridFHEMode.CALIBRATE:
             # Calling torch + gathering calibration data
             assert self.private_module is not None
@@ -278,14 +267,8 @@ def forward(self, x: torch.Tensor) -> Union[torch.Tensor, QuantTensor]:
 
         elif self.fhe_local_mode == HybridFHEMode.REMOTE:  # pragma:no cover
             # Remote call
-            try:
-                optimized_linear_layer_executor = _optimized_linear_executor.get()
-            except LookupError:
-                optimized_linear_layer_executor = None
-
-            assert optimized_linear_layer_executor is None, (
-                "Remote optimized linear layers " "are not yet implemented"
-            )
+            # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/4672
+            assert self.executor is None, "Remote optimized linear layers are not yet implemented"
             y = self.remote_call(x)
 
         elif self.fhe_local_mode == HybridFHEMode.TORCH:
@@ -400,13 +383,14 @@ def __init__(
         self.configuration: Optional[Configuration] = None
         self.model_name = model_name
         self.verbose = verbose
+        self.executor: Optional[GLWELinearLayerExecutor] = None
 
         self._replace_modules()
 
     def _replace_modules(self):
         """Replace the private modules in the model with remote layers."""
 
-        self._all_layers_are_pure_linear = True
+        self._has_only_large_linear_layers = True
         for module_name in self.module_names:
             # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3858
             # Conv1d introduce reshaping operations which adds more TLU
@@ -421,8 +405,27 @@ def _replace_modules(self):
                 self.private_modules[module_name],
                 (nn.Linear, ForwardModuleLinear, BackwardModuleLinear),
             )
+
+            # Check input dimensions for linear layers
+            # If the input dimension is less than 512 we do not use the GLWE optimization.
+            # Optimal input dimension is 2048, below 512 the performance is too low.
+            if is_pure_linear_layer:
+                module = self.private_modules[module_name]
+                # Use weight shape instead of in/out_features
+                if hasattr(module, "weight"):
+                    input_dim = module.weight.shape[
+                        1
+                    ]  # Input dimension is second dimension for Linear layers
+                    output_dim = module.weight.shape[0]  # Output dimension is first dimension
+                else:
+                    input_dim = output_dim = 0
+
+                is_pure_linear_layer = (
+                    is_pure_linear_layer and input_dim >= 512 and output_dim >= 512
+                )
+
             if not is_pure_linear_layer:
-                self._all_layers_are_pure_linear = False
+                self._has_only_large_linear_layers = False
 
         for module_name in self.module_names:
             # Create the optimized glwe linear layer executor if needed
@@ -432,7 +435,7 @@ def _replace_modules(self):
                 module_name=module_name,
                 model_name=self.model_name,
                 verbose=self.verbose,
-                optimized_linear_execution=self._all_layers_are_pure_linear,
+                optimized_linear_execution=(self._has_only_large_linear_layers),
             )
             self.remote_modules[module_name] = remote_module
@@ -462,7 +465,7 @@ def forward(self, x: torch.Tensor, fhe: str = "disable") -> torch.Tensor:
         # Validate the FHE mode
         fhe_mode = HybridFHEMode(fhe)
 
-        if _HAS_GLWE_BACKEND and self._all_layers_are_pure_linear:
+        if _HAS_GLWE_BACKEND and self._has_only_large_linear_layers:
             if fhe_mode == HybridFHEMode.SIMULATE:
                 raise AssertionError(
                     "When the HybridFHEModel is instantiated with only "
@@ -470,23 +473,20 @@ def forward(self, x: torch.Tensor, fhe: str = "disable") -> torch.Tensor:
 
             if fhe_mode in (HybridFHEMode.EXECUTE, HybridFHEMode.REMOTE, HybridFHEMode.DISABLE):
-                # If all layers are pure linear, enable the GLWE optimization for all layers
-                # and generate an encryption and compression key for all layers
-                # as they share crypto-parameters
+                # Initialize executor only if not already done
+                if self.executor is None:
+                    self.executor = GLWELinearLayerExecutor()
 
-                # Loading keys from a file could be done here, and the
-                # keys could be passed as arguments to the Executor
-                executor = GLWELinearLayerExecutor()
+                # Generate keys only if needed and not already done
+                if fhe_mode != HybridFHEMode.DISABLE and self.executor.private_key is None:
+                    self.executor.keygen()
 
-                if fhe_mode != HybridFHEMode.DISABLE:
-                    executor.keygen()
-
-                _optimized_linear_executor.set(executor)
+                # Update executor for all remote modules
+                for module in self.remote_modules.values():
+                    module.executor = self.executor
 
         result = self.model(x)
 
-        _optimized_linear_executor.set(None)
-
         return result
 
     def __call__(self, x: torch.Tensor, fhe: str = "disable") -> torch.Tensor:
@@ -589,7 +589,8 @@ def compile_model(
             # If all layers are linear and the GLWE backend is available
             # then simply quantize the model without compiling with
             # Concrete Python.
-            if self._all_layers_are_pure_linear and _HAS_GLWE_BACKEND:
+            if self._has_only_large_linear_layers and _HAS_GLWE_BACKEND:
+                self.executor = GLWELinearLayerExecutor()
                 self.private_q_modules[name] = build_quantized_module(
                     self.private_modules[name],
                     calibration_data_tensor,
@@ -645,7 +646,15 @@ def save_and_clear_private_info(self, path: Path, via_mlir=True):
             path (Path): The directory where the model and the FHE circuit will be saved.
             via_mlir (bool): if fhe circuits should be serialized using via_mlir option
                 useful for cross-platform (compile on one architecture and run on another)
+
+        Raises:
+            NotImplementedError: GLWE backend deployment is not yet supported
         """
+        # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/4672
+        # GLWE backend deployment is not yet supported
+        if self.executor is not None:
+            raise NotImplementedError("GLWE backend deployment is not yet supported")
+
         path = Path(path)
         path.mkdir(parents=True, exist_ok=True)
diff --git a/tests/torch/test_hybrid_converter.py b/tests/torch/test_hybrid_converter.py
index 821903bcf..67af03037 100644
--- a/tests/torch/test_hybrid_converter.py
+++ b/tests/torch/test_hybrid_converter.py
@@ -1,5 +1,6 @@
 """Tests for the hybrid model converter."""
 
+import importlib
 import sys
 import tempfile
 from pathlib import Path
@@ -9,7 +10,6 @@
 import pytest
 import torch
 from concrete.fhe import Configuration
-from sklearn.datasets import make_moons
 from sklearn.model_selection import train_test_split
 from transformers import GPT2LMHeadModel, GPT2Tokenizer
@@ -65,15 +65,16 @@ def run_hybrid_llm_test(
     if has_pbs_reshape:
         has_pbs = True
 
-    # Propagate glwe_backend_installed state being tested to constants of affected modules
-    for affected_module in (
-        concrete.ml.quantization.linear_op_glwe_backend,
-        concrete.ml.torch.hybrid_model,
-    ):
-        m.setattr(affected_module, "_HAS_GLWE_BACKEND", glwe_backend_installed)
+    # Patching for GLWE backend
+    if not glwe_backend_installed:
+        m.setitem(sys.modules, "concrete_ml_extensions", None)
+
+        # Reload the affected modules to ensure the changes take effect
+        importlib.reload(concrete.ml.quantization.linear_op_glwe_backend)
+        importlib.reload(concrete.ml.torch.hybrid_model)
 
-    # Create a hybrid model
     hybrid_model = HybridFHEModel(model, module_names)
+    is_compiled = False
     try:
         hybrid_model.compile_model(
             inputs,
@@ -82,6 +83,7 @@ def run_hybrid_llm_test(
             rounding_threshold_bits=8,
             configuration=configuration,
         )
+        is_compiled = True
     except RuntimeError as error:
         # When reshaping adds PBSs we sometimes encounter NoParametersFound
         # when compiling. In this case we skip the rest since we can't simulate
@@ -154,10 +156,20 @@ def run_hybrid_llm_test(
         # Get the temp directory path
         if not has_pbs and glwe_backend_installed:
-            # Deployment of GLWE backend hybrid models is not yet supported
-            with pytest.raises(AttributeError, match="The quantized module is not compiled.*"):
-                hybrid_model.save_and_clear_private_info(temp_dir_path)
+            if is_compiled:
+                # Deployment of GLWE backend hybrid models is not yet supported
+                with pytest.raises(
+                    NotImplementedError, match="GLWE backend deployment is not yet supported"
+                ):
+                    hybrid_model.save_and_clear_private_info(temp_dir_path)
+            else:
+                # Check that we get an error when trying to save a non-compiled model
+                with pytest.raises(
+                    AttributeError,
+                    match="The quantized module is not compiled. Please run compile*",
+                ):
+                    hybrid_model.save_and_clear_private_info(temp_dir_path)
         else:
             hybrid_model.save_and_clear_private_info(temp_dir_path)
@@ -279,7 +291,7 @@ def test_invalid_model():
         HybridFHEModel(invalid_model, module_names="sub_module")
 
 
-@pytest.mark.parametrize("n_hidden", [512, 2048])
+@pytest.mark.parametrize("n_hidden", [256, 512, 2048])
 def test_hybrid_glwe_correctness(n_hidden):
     """Tests that the GLWE backend produces correct results for the hybrid model."""
@@ -295,13 +307,15 @@ def prepare_data(x, y, test_size=0.1, random_state=42):
         y_test = torch.tensor(y_test, dtype=torch.long)
         return x_train, x_test, y_train, y_test
 
-    # Generate synthetic 2D data
-    x1_data, y1_data = make_moons(n_samples=num_samples, noise=0.2, random_state=42)
+    # Generate random data with n_hidden features and n_hidden classes,
+    # keeping input and output dimensions equal to n_hidden.
+    x1_data = numpy.random.randn(num_samples, n_hidden)
+    y1_data = numpy.random.randint(0, n_hidden, size=num_samples)  # n_hidden classes
 
     # Prepare data
     x1_train, x1_test, y1_train, y1_test = prepare_data(x1_data, y1_data)
 
-    model = FCSmall(2, torch.nn.ReLU, hidden=n_hidden)
+    model = FCSmall(n_hidden, torch.nn.ReLU, hidden=n_hidden)
     optimizer = torch.optim.Adam(model.parameters())
 
     num_epochs = 100
@@ -325,38 +339,48 @@ def prepare_data(x, y, test_size=0.1, random_state=42):
 
     # This internal flag tells us whether all the layers
    # were linear and were replaced with the GLWE backend
-    assert hybrid_local._all_layers_are_pure_linear  # pylint: disable=protected-access
+    # Check if GLWE optimization should be used based on input dimension
+    should_use_glwe = n_hidden >= 512
+    is_pure_linear = hybrid_local._has_only_large_linear_layers  # pylint: disable=protected-access
+    assert is_pure_linear == should_use_glwe
 
     hybrid_local.compile_model(x1_train, n_bits=10)
 
     y_qm = hybrid_local(x1_test, fhe="disable").numpy()
     y_hybrid_torch = hybrid_local(x1_test, fhe="torch").detach().numpy()
-    y_glwe = hybrid_local(x1_test, fhe="execute").numpy()
-
-    y1_test = y1_test.numpy()
-    n_correct_fp32 = numpy.sum(numpy.argmax(y_torch, axis=1) == y1_test)
-    n_correct_qm = numpy.sum(numpy.argmax(y_qm, axis=1) == y1_test)
-    n_correct_glwe = numpy.sum(numpy.argmax(y_glwe, axis=1) == y1_test)
+    # Only test GLWE execution if input dimension is >= 512
+    if should_use_glwe:
+        y_glwe = hybrid_local(x1_test, fhe="execute").numpy()
+
+        y1_test = y1_test.numpy()
+        n_correct_fp32 = numpy.sum(numpy.argmax(y_torch, axis=1) == y1_test)
+        n_correct_qm = numpy.sum(numpy.argmax(y_qm, axis=1) == y1_test)
+        n_correct_glwe = numpy.sum(numpy.argmax(y_glwe, axis=1) == y1_test)
 
-    # These two should be exactly the same
-    assert numpy.all(numpy.allclose(y_torch, y_hybrid_torch, rtol=1, atol=0.001))
+        # These two should be exactly the same
+        assert numpy.all(numpy.allclose(y_torch, y_hybrid_torch, rtol=1, atol=0.001))
 
-    # The clear quantization vs fp32 test has more tolerance
-    threshold_fhe = 0.01
+        # The clear quantization vs fp32 test has more tolerance
+        threshold_fhe = 0.01
 
-    diff = numpy.abs(y_torch - y_glwe) > threshold_fhe
-    if numpy.any(diff):
-        print(f"Value discrepancy detected for GLWE backend, with epsilon={threshold_fhe}")
-        print("Model output (torch fp32)", y_torch[diff])
-        print("Model output (glwe)", y_glwe[diff])
-        print("Model output (quantized clear)", y_qm[diff])
+        diff = numpy.abs(y_torch - y_glwe) > threshold_fhe
+        if numpy.any(diff):
+            print(f"Value discrepancy detected for GLWE backend, with epsilon={threshold_fhe}")
+            print("Model output (torch fp32)", y_torch[diff])
+            print("Model output (glwe)", y_glwe[diff])
+            print("Model output (quantized clear)", y_qm[diff])
 
-    assert numpy.all(numpy.allclose(y_qm, y_glwe, rtol=1, atol=threshold_fhe))
-    assert numpy.all(numpy.allclose(y_torch, y_glwe, rtol=1, atol=threshold_fhe))
+        assert numpy.all(numpy.allclose(y_qm, y_glwe, rtol=1, atol=threshold_fhe))
+        assert numpy.all(numpy.allclose(y_torch, y_glwe, rtol=1, atol=threshold_fhe))
 
-    n_correct_delta_threshold_fhe = 1
-    # Check accuracy between fp32 and glwe
-    assert numpy.abs(n_correct_fp32 - n_correct_glwe) <= n_correct_delta_threshold_fhe
+        n_correct_delta_threshold_fhe = 1
+        # Check accuracy between fp32 and glwe
+        assert numpy.abs(n_correct_fp32 - n_correct_glwe) <= n_correct_delta_threshold_fhe
 
-    # Check accuracy between quantized and glwe
-    assert numpy.abs(n_correct_qm - n_correct_glwe) <= n_correct_delta_threshold_fhe
+        # Check accuracy between quantized and glwe
+        assert numpy.abs(n_correct_qm - n_correct_glwe) <= n_correct_delta_threshold_fhe
+    else:
+        # For non-GLWE cases, just verify the torch outputs match
+        assert numpy.all(numpy.allclose(y_torch, y_hybrid_torch, rtol=1, atol=0.001))
+        assert numpy.all(numpy.allclose(y_qm, y_hybrid_torch, rtol=1, atol=0.01))
diff --git a/use_case_examples/deployment/README.md b/use_case_examples/deployment/README.md
index 38baac25b..fed7b713e 100644
--- a/use_case_examples/deployment/README.md
+++ b/use_case_examples/deployment/README.md
@@ -7,26 +7,26 @@ This folder contains examples of how to deploy Concrete ML models using Fully Ho
 The deployment process generally follows these steps:
 
 1. Train the model (optional, depending on the use case)
-2. Compile the model to an FHE circuit
-3. Deploy the model using Docker
-4. Run inference using a client (locally or in Docker)
+1. Compile the model to an FHE circuit
+1. Deploy the model using Docker
+1. Run inference using a client (locally or in Docker)
 
 ## Available Examples
 
 We provide three different use cases to demonstrate the deployment process:
 
 1. [Breast Cancer Classification](./breast_cancer/README.md)
-2. [Sentiment Analysis](./sentiment_analysis/README.md)
-3. [CIFAR-10 Image Classification](./cifar/README.md)
+1. [Sentiment Analysis](./sentiment_analysis/README.md)
+1. [CIFAR-10 Image Classification](./cifar/README.md)
 
 ## Getting Started
 
 Each example folder contains its own README with specific instructions. However, the general process is similar:
 
 1. Train or compile the model using the provided scripts
-2. Deploy the model using `deploy_to_docker.py` from the `server` folder
-3. Build the client Docker image
-4. Run the client to interact with the deployed model
+1. Deploy the model using `deploy_to_docker.py` from the `server` folder
+1. Build the client Docker image
+1. Run the client to interact with the deployed model
 
 For detailed instructions, please refer to the README in each example folder.