Merge pull request #16 from neuralmagic/improve-memory-usage
Improve memory usage by properly cleaning up weights as quantized
mgoin authored Jun 13, 2024
2 parents ffea17e + 9b8abad commit 2e134d8
Showing 1 changed file with 6 additions and 3 deletions.
auto_fp8/quantize.py
@@ -202,11 +202,14 @@ def quantize_weights(
             or name in quantize_config.ignored_layers
         ):
             continue
-        quant_weight, quant_scale = per_tensor_quantize(linear.weight)
-        quant_linear = FP8DynamicLinear(quant_weight, quant_scale, linear.bias)
+        quant_weight, quant_scale = per_tensor_quantize(linear.weight.clone())
+        bias = linear.bias.clone() if linear.bias is not None else None
+        quant_linear = FP8DynamicLinear(quant_weight, quant_scale, bias)
         replace_module(model, name, quant_linear)
+        del linear.weight
+        del linear.bias
         del linear
-        cleanup_memory()
+    cleanup_memory()


 def quantize_activations(
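About the change: the quantized weight is now computed from a clone of linear.weight, and the bias is cloned before being handed to FP8DynamicLinear, so the replacement module shares no storage with the original torch.nn.Linear. Deleting linear.weight and linear.bias then drops the last references to the full-precision parameters, and each layer's memory can be reclaimed as soon as that layer has been quantized. Below is a minimal, self-contained sketch of this clone-then-delete pattern. The bodies of per_tensor_quantize() and cleanup_memory() are not part of this diff, so the versions here (a single FP8 scale per tensor, and gc.collect() plus torch.cuda.empty_cache()) are illustrative assumptions rather than the repository's exact implementations; torch.float8_e4m3fn also requires a recent PyTorch (2.1+).

import gc

import torch


def cleanup_memory():
    # Assumed body; the diff only shows the call site.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


@torch.no_grad()
def per_tensor_quantize(weight: torch.Tensor):
    # Simplified stand-in: scale the whole tensor by one factor and cast to FP8.
    finfo = torch.finfo(torch.float8_e4m3fn)
    scale = weight.abs().max().clamp(min=1e-12) / finfo.max
    qweight = (weight / scale).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
    return qweight, scale


linear = torch.nn.Linear(4096, 4096)

# Quantize from a clone so nothing downstream aliases the original
# parameter; the bias in particular stays unquantized, so without the
# clone the new module would keep the old Parameter alive.
quant_weight, quant_scale = per_tensor_quantize(linear.weight.clone())
bias = linear.bias.clone() if linear.bias is not None else None

# Deleting the attributes removes the Parameters from the module, so
# the full-precision tensors are freed eagerly by reference counting.
del linear.weight
del linear.bias
del linear

cleanup_memory()  # the diff calls this once, after the loop

The dels are what free memory layer by layer; the single cleanup_memory() at the end only has to collect stragglers and return CUDA's cached blocks, instead of running a garbage-collection pass inside the loop for every layer.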
