From 43601e17287349b84eb247779a4b23e71335afbf Mon Sep 17 00:00:00 2001 From: Andrei Stoian Date: Wed, 18 Dec 2024 21:46:14 +0100 Subject: [PATCH] fix: full gpu hybrid model --- .../ml/quantization/linear_op_glwe_backend.py | 26 +++++++------- .../ml/quantization/quantized_module.py | 16 +++++---- src/concrete/ml/quantization/quantizers.py | 34 ++++++++++++++----- src/concrete/ml/torch/hybrid_model.py | 24 +++++++++---- 4 files changed, 65 insertions(+), 35 deletions(-) diff --git a/src/concrete/ml/quantization/linear_op_glwe_backend.py b/src/concrete/ml/quantization/linear_op_glwe_backend.py index 66604928b..98b2b4527 100644 --- a/src/concrete/ml/quantization/linear_op_glwe_backend.py +++ b/src/concrete/ml/quantization/linear_op_glwe_backend.py @@ -3,6 +3,7 @@ import json import numpy +import torch from ..common.utils import HybridFHEMode, to_tuple from .quantized_module import QuantizedModule @@ -55,8 +56,8 @@ def keygen(self): ) def forward( - self, x: numpy.ndarray, q_module: QuantizedModule, fhe: HybridFHEMode - ) -> numpy.ndarray: + self, x: torch.Tensor, q_module: QuantizedModule, fhe: HybridFHEMode + ) -> torch.Tensor: """Perform the inference of this linear layer. Args: @@ -91,23 +92,23 @@ def forward( assert weight_bias[0].quantizer.quant_params.zero_point == 0 # Retrieve quantized weights - q_weight = weight_bias[0].qvalues + q_weight = weight_bias[0].values + assert(isinstance(q_weight, numpy.ndarray)) + assert(q_weight.dtype == numpy.float32) q_weight = numpy.transpose(q_weight) if transpose_inputs2 else q_weight - q_x = q_module.quantize_input(x) - assert q_x is not None - assert isinstance(q_x, numpy.ndarray) - - q_x = numpy.transpose(q_x) if transpose_inputs1 else q_x + q_x = q_module.quantize_input(x, dtype=numpy.float32 if fhe == HybridFHEMode.DISABLE else None) + q_x = torch.transpose(q_x) if transpose_inputs1 else q_x if fhe == HybridFHEMode.DISABLE: # There is no need to add the bias to the de-quantized values # as the bias is already included in the output quantizer # zero-point, in the analytical calibration - q_x = q_x.astype(numpy.float32) - q_weight = q_weight.astype(numpy.float32) - y = q_module.dequantize_output(*to_tuple(numpy.matmul(q_x, q_weight))) + + q_w = torch.from_numpy(q_weight).to(q_x.device) + mm = torch.matmul(q_x, q_w) + y = q_module.dequantize_output(*to_tuple(mm)) else: # Need to slice the last GLWE (this will be improved in later cml-extensions) num_valid_glwe_values_in_last_ciphertext = ( @@ -162,7 +163,6 @@ def forward( if return_2d: y = numpy.squeeze(y) - # Only single outputs are supported - assert isinstance(y, numpy.ndarray) + y = y.astype(numpy.float32) return y diff --git a/src/concrete/ml/quantization/quantized_module.py b/src/concrete/ml/quantization/quantized_module.py index 7761d7bbe..a14996d5a 100644 --- a/src/concrete/ml/quantization/quantized_module.py +++ b/src/concrete/ml/quantization/quantized_module.py @@ -8,6 +8,7 @@ import numpy import onnx +import torch from concrete.fhe.compilation.artifacts import DebugArtifacts from concrete.fhe.compilation.circuit import Circuit from concrete.fhe.compilation.compiler import Compiler @@ -702,7 +703,7 @@ def _fhe_forward( return q_results def quantize_input( - self, *x: Optional[numpy.ndarray] + self, *x: Optional[Union[numpy.ndarray, torch.Tensor]], dtype=numpy.int64 ) -> Union[numpy.ndarray, Tuple[Optional[numpy.ndarray], ...]]: """Take the inputs in fp32 and quantize it using the learned quantization parameters. @@ -729,7 +730,7 @@ def quantize_input( # cannot be None q_x = tuple( ( - self.input_quantizers[idx].quant(x[idx]) # type: ignore[arg-type] + self.input_quantizers[idx].quant(x[idx], dtype) # type: ignore[arg-type] if x[idx] is not None else None ) @@ -738,7 +739,7 @@ def quantize_input( # Make sure all inputs are quantized to int64 assert all_values_are_of_dtype( - *q_x, dtypes="int64", allow_none=True + *q_x, dtypes=numpy.dtype(dtype).name, allow_none=True ), "Inputs were not quantized to int64" if len(q_x) == 1: @@ -749,8 +750,8 @@ def quantize_input( return q_x def dequantize_output( - self, *q_y_preds: numpy.ndarray - ) -> Union[numpy.ndarray, Tuple[numpy.ndarray, ...]]: + self, *q_y_preds: Union[numpy.ndarray, torch.Tensor] + ) -> Union[Union[numpy.ndarray, torch.Tensor], Tuple[Union[numpy.ndarray, torch.Tensor], ...]]: """Take the last layer q_out and use its de-quant function. Args: @@ -767,10 +768,13 @@ def dequantize_output( ) y_preds = tuple( - numpy.array(output_quantizer.dequant(q_y_pred)) + output_quantizer.dequant(q_y_pred) for q_y_pred, output_quantizer in zip(q_y_preds, self.output_quantizers) ) + if not isinstance(q_y_preds[0], torch.Tensor): + y_preds = tuple(map(numpy.array, y_preds)) + if len(y_preds) == 1: return y_preds[0] diff --git a/src/concrete/ml/quantization/quantizers.py b/src/concrete/ml/quantization/quantizers.py index 8e65b54d3..c42706abc 100644 --- a/src/concrete/ml/quantization/quantizers.py +++ b/src/concrete/ml/quantization/quantizers.py @@ -7,6 +7,7 @@ from typing import Any, Dict, Optional, TextIO, Union, get_type_hints import numpy +import torch from concrete.fhe.tracing.tracer import Tracer from ..common.debugging import assert_true @@ -671,7 +672,7 @@ def dump(self, file: TextIO) -> None: """ dump(self, file) - def quant(self, values: numpy.ndarray) -> numpy.ndarray: + def quant(self, values: Union[numpy.ndarray, torch.Tensor], dtype=numpy.int64) -> numpy.ndarray: """Quantize values. Args: @@ -686,10 +687,17 @@ def quant(self, values: numpy.ndarray) -> numpy.ndarray: assert self.offset is not None assert self.scale is not None - if QUANT_ROUND_LIKE_ROUND_PBS: - qvalues = numpy.floor(values / self.scale + self.zero_point + 0.5) # pragma: no cover + assert dtype in (numpy.int64, numpy.int32, numpy.float32, numpy.float64) + + delta = 0.5 if QUANT_ROUND_LIKE_ROUND_PBS else 0 + if isinstance(values, numpy.ndarray): + round_func = numpy.floor if QUANT_ROUND_LIKE_ROUND_PBS else numpy.rint + clip_func = numpy.clip else: - qvalues = numpy.rint(values / self.scale + self.zero_point) + round_func = torch.floor if QUANT_ROUND_LIKE_ROUND_PBS else torch.round + clip_func = torch.clip + + qvalues = round_func(values / self.scale + self.zero_point + delta) # Clipping must be performed for PTQ and for precomputed (for now only Brevitas) QAT # (where quantizer parameters are available in ONNX layers). @@ -705,11 +713,15 @@ def quant(self, values: numpy.ndarray) -> numpy.ndarray: if self.is_narrow: min_value += 1 - qvalues = qvalues.clip(min_value, 2 ** (self.n_bits) - 1 - self.offset) + qvalues = clip_func(qvalues, min_value, 2 ** (self.n_bits) - 1 - self.offset) - return qvalues.astype(numpy.int64) + # Only cast for numpy usage for Concrete circuits + if isinstance(values, numpy.ndarray): + qvalues = qvalues.astype(dtype) - def dequant(self, qvalues: numpy.ndarray) -> Union[float, numpy.ndarray, Tracer]: + return qvalues + + def dequant(self, qvalues: Union[numpy.ndarray, torch.Tensor]) -> Union[float, numpy.ndarray, torch.Tensor, Tracer]: """De-quantize values. Args: @@ -731,9 +743,13 @@ def dequant(self, qvalues: numpy.ndarray) -> Union[float, numpy.ndarray, Tracer] + ((" " + str(self.scale.dtype)) if isinstance(self.scale, numpy.ndarray) else ""), ) - values = self.scale * (qvalues - numpy.asarray(self.zero_point, dtype=numpy.float64)) + prepared_zp = numpy.asarray(self.zero_point, dtype=numpy.float64) + if isinstance(qvalues, torch.Tensor): + prepared_zp = torch.from_numpy(prepared_zp).float().to(qvalues.device) + + values = self.scale * (qvalues - prepared_zp) - assert isinstance(values, (float, numpy.ndarray, Tracer)), f"{values=}, {type(values)=}" + assert isinstance(values, (float, numpy.ndarray, torch.Tensor, Tracer)), f"{values=}, {type(values)=}" return values diff --git a/src/concrete/ml/torch/hybrid_model.py b/src/concrete/ml/torch/hybrid_model.py index 5aa58e5a0..aa21490db 100644 --- a/src/concrete/ml/torch/hybrid_model.py +++ b/src/concrete/ml/torch/hybrid_model.py @@ -247,16 +247,15 @@ def forward(self, x: torch.Tensor) -> Union[torch.Tensor, QuantTensor]: if self.executor: # Delegate to the optimized GLWE executor - y = torch.Tensor( - self.executor.forward( - x.detach().numpy(), self.private_q_module, self.fhe_local_mode + y = self.executor.forward( + x.detach(), self.private_q_module, self.fhe_local_mode ) - ) else: + device = x.device # Delegate to the quantized module for all fhe modes y = torch.Tensor( - self.private_q_module.forward(x.detach().numpy(), fhe=self.fhe_local_mode.value) - ) + self.private_q_module.forward(x.cpu().detach().numpy(), fhe=self.fhe_local_mode.value) + ).to(device) elif self.fhe_local_mode == HybridFHEMode.CALIBRATE: # Calling torch + gathering calibration data @@ -568,7 +567,9 @@ def compile_model( self.configuration = configuration - for name in self.module_names: + from tqdm import tqdm + + for name in tqdm(self.module_names): remote_module = self._get_module_by_name(self.model, name) assert isinstance(remote_module, RemoteModule) @@ -596,6 +597,13 @@ def compile_model( n_bits=n_bits, rounding_threshold_bits=rounding_threshold_bits, ) + + vals = self.private_q_modules[name].quant_layers_dict.values() + _, q_op = next(iter(vals)) + const_inp = q_op.constant_inputs[1] # Get the weights, the bias is in [2] + const_inp.values = const_inp.qvalues.astype(numpy.float32) + + self.private_q_modules[name]._onnx_model = None else: self.private_q_modules[name] = compile_torch_model( self.private_modules[name], @@ -608,6 +616,8 @@ def compile_model( self.remote_modules[name].private_q_module = self.private_q_modules[name] + remote_module.calibration_data = None + def _save_fhe_circuit(self, path: Path, via_mlir=False): """Private method that saves the FHE circuits.