From 9b8abadc2691f67ea2cd65a69c3285e82d849e25 Mon Sep 17 00:00:00 2001
From: mgoin
Date: Thu, 13 Jun 2024 16:23:37 +0000
Subject: [PATCH] Improve memory usage by properly cleaning up weights as
 quantized

---
 auto_fp8/quantize.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/auto_fp8/quantize.py b/auto_fp8/quantize.py
index deb53f3..85fd491 100644
--- a/auto_fp8/quantize.py
+++ b/auto_fp8/quantize.py
@@ -202,11 +202,14 @@ def quantize_weights(
             or name in quantize_config.ignored_layers
         ):
             continue
-        quant_weight, quant_scale = per_tensor_quantize(linear.weight)
-        quant_linear = FP8DynamicLinear(quant_weight, quant_scale, linear.bias)
+        quant_weight, quant_scale = per_tensor_quantize(linear.weight.clone())
+        bias = linear.bias.clone() if linear.bias is not None else None
+        quant_linear = FP8DynamicLinear(quant_weight, quant_scale, bias)
         replace_module(model, name, quant_linear)
+        del linear.weight
+        del linear.bias
         del linear
-        cleanup_memory()
+    cleanup_memory()


 def quantize_activations(
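
Note on the change: quantizing from a clone of each weight (and cloning the bias) lets the new FP8DynamicLinear own its own tensors, so the loop can delete the original fp16 weight and bias of each layer as soon as it is replaced rather than keeping both copies alive until the whole model is converted. The cache flush is also moved out of the per-layer loop so it runs once at the end. The patch does not show cleanup_memory() itself; the sketch below is only an assumption of what such a helper typically looks like, not the repository's actual implementation.

    import gc

    import torch


    def cleanup_memory():
        # Drop unreachable Python objects first so their CUDA storage is released,
        # then return cached blocks to the allocator so reserved memory shrinks.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()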