Improve memory usage by properly cleaning up weights as they are quantized

neuralmagic · Jun 13, 2024 · 9b8abad · 9b8abad
1 parent ffea17e
commit 9b8abad
Showing 1 changed file with 6 additions and 3 deletions.
diff --git a/auto_fp8/quantize.py b/auto_fp8/quantize.py
@@ -202,11 +202,14 @@ def quantize_weights(
             or name in quantize_config.ignored_layers
         ):
             continue
-        quant_weight, quant_scale = per_tensor_quantize(linear.weight)
-        quant_linear = FP8DynamicLinear(quant_weight, quant_scale, linear.bias)
+        quant_weight, quant_scale = per_tensor_quantize(linear.weight.clone())
+        bias = linear.bias.clone() if linear.bias is not None else None
+        quant_linear = FP8DynamicLinear(quant_weight, quant_scale, bias)
         replace_module(model, name, quant_linear)
+        del linear.weight
+        del linear.bias
         del linear
-        cleanup_memory()
+    cleanup_memory()
 
 
 def quantize_activations(