diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..c94dc841
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,4 @@
+build:
+	CUDA_VISIBLE_DEVICES=-1 pip install -e . -vvv
+
+.PHONY: build
\ No newline at end of file
diff --git a/awq/modules/linear/gemm.py b/awq/modules/linear/gemm.py
index 7ee89cc8..3a376336 100644
--- a/awq/modules/linear/gemm.py
+++ b/awq/modules/linear/gemm.py
@@ -5,6 +5,8 @@
 from awq.utils.module import try_import
 from awq.utils.utils import get_best_device
 from awq.utils.packing_utils import dequantize_gemm
+import logging
+logger = logging.getLogger(__name__)
 
 # NOTE: We check if awq_ext or triton is available. awq_ext will be preferred if both are installed.
@@ -199,6 +201,7 @@ def from_linear(
                     / awq_linear.scales[idx // group_size]
                 ).to(torch.int)[:, None]
             )
+        logger.warning("Got int weight...")
         intweight = torch.cat(intweight, dim=1)
         intweight = intweight.t().contiguous()
         intweight = intweight.to(dtype=torch.int32)
@@ -225,7 +228,7 @@ def from_linear(
                 qweight[:, col] |= qweight_col << (i * awq_linear.w_bit)
         awq_linear.qweight = qweight
 
-        zeros = zeros.to(dtype=torch.int32, device=best_device)
+        zeros = zeros.to(dtype=torch.int32, device="cpu")
 
         if "mps" in best_device:
             zeros = zeros.to("cpu")
@@ -235,7 +238,7 @@ def from_linear(
             dtype=torch.int32,
             device=zeros.device,
         )
-
+        logger.warning("PACK Qzeros...")
         for col in range(zeros.shape[1] // pack_num):
             if awq_linear.w_bit == 4:
                 order_map = [0, 2, 4, 6, 1, 3, 5, 7]
@@ -244,7 +247,9 @@ def from_linear(
             for i in range(pack_num):
                 qzero_col = zeros[:, col * pack_num + order_map[i]]
                 qzeros[:, col] |= qzero_col << (i * awq_linear.w_bit)
+        logger.warning("PACK Qzeros done...")
         awq_linear.qzeros = qzeros
+        awq_linear = awq_linear.to(best_device)
 
         return awq_linear
diff --git a/awq/quantize/quantizer.py b/awq/quantize/quantizer.py
index cd9fb0dd..9dbdbf35 100644
--- a/awq/quantize/quantizer.py
+++ b/awq/quantize/quantizer.py
@@ -22,7 +22,9 @@
     set_op_by_name,
     exclude_layers_to_not_quantize,
 )
-
+import logging
+logger = logging.getLogger(__name__)
+import habana_frameworks.torch.core as htcore
 
 class AwqQuantizer:
     def __init__(
@@ -70,13 +72,17 @@ def __init__(
             n_samples=self.max_calib_samples, max_seq_len=self.max_calib_seq_len
         )
 
-    def pseudo_quantize_tensor(self, w: torch.Tensor):
+    def pseudo_quantize_tensor(self, w: torch.Tensor, return_int=False):
         org_w_shape = w.shape
         if self.group_size > 0:
             assert org_w_shape[-1] % self.group_size == 0
             w = w.reshape(-1, self.group_size)
+            if torch.isnan(w).sum() > 0:
+                breakpoint()
+                logging.error(f"Found {torch.isnan(w).sum()} NaNs in weight matrix")
         assert w.dim() == 2
         assert torch.isnan(w).sum() == 0
+        # breakpoint()
 
         # zero point quantization
         if self.zero_point:
@@ -86,9 +92,8 @@ def pseudo_quantize_tensor(self, w: torch.Tensor):
             min_int = 0
             scales = (max_val - min_val).clamp(min=1e-5) / max_int
             zeros = (-torch.round(min_val / scales)).clamp_(min_int, max_int)
-            w = (
-                torch.clamp(torch.round(w / scales) + zeros, min_int, max_int) - zeros
-            ) * scales
+            w_int = torch.clamp(torch.round(w / scales) + zeros, min_int, max_int) - zeros
+            w = w_int * scales
             zeros = zeros.view(org_w_shape[0], -1)
         else:
             max_val = w.abs().amax(dim=1, keepdim=True)
@@ -97,14 +102,19 @@ def pseudo_quantize_tensor(self, w: torch.Tensor):
             min_int = -(2 ** (self.w_bit - 1))
             scales = max_val / max_int
             zeros = None
-            w = torch.clamp(torch.round(w / scales), min_int, max_int) * scales
-
+            w_int = torch.clamp(torch.round(w / scales), min_int, max_int)
+            w = w_int * scales
+        if torch.isnan(w).sum() > 0:
+            breakpoint()
+            logging.error(f"Found {torch.isnan(w).sum()} NaNs in weight matrix {w.shape}")
         assert torch.isnan(scales).sum() == 0
         assert torch.isnan(w).sum() == 0
 
         scales = scales.view(org_w_shape[0], -1)
         w = w.reshape(org_w_shape)
-
+
+        if return_int:
+            return w, scales, zeros, w_int.reshape(org_w_shape)
         return w, scales, zeros
 
     def pseudo_dequantize_tensor(
@@ -124,7 +134,10 @@ def pseudo_dequantize_tensor(
         return w
 
     def quantize(self):
+        self._num_modules = len(self.modules)
         for i in tqdm(range(len(self.modules)), desc="AWQ"):
+            # if i > 1:
+            #     return
             # Move module and inputs to correct device
             common_device = next(self.modules[i].parameters()).device
             if common_device is None or str(common_device) == "cpu":
@@ -171,6 +184,7 @@ def quantize(self):
             scales_list = append_str_prefix(
                 scales_list, get_op_name(self.model, self.modules[i]) + "."
             )
+            logger.warning(f"Applied scales: {scales_list}")
 
             # [STEP 3]: Compute and apply clipping list
             if self.apply_clip:
@@ -199,43 +213,61 @@ def pack(self):
 
     def _apply_quant(self, module, named_linears: Dict[str, nn.Linear]):
         for name, linear_layer in named_linears.items():
-            # NOTE: small regression in perplexity if linear layer uses .cpu().float()
-            linear_layer = linear_layer.to(get_best_device()).half()
-
-            linear_layer.weight.data, scales, zeros = self.pseudo_quantize_tensor(
-                linear_layer.weight.data
-            )
-
-            if self.version == "gemm":
-                scales = scales.t().contiguous()
-                if zeros is not None:
-                    zeros = zeros.t().contiguous()
-                q_linear_module = WQLinear_GEMM
-
-            elif self.version == "gemv":
-                q_linear_module = WQLinear_GEMV
-
-            elif self.version == "marlin":
-                q_linear_module = WQLinear_Marlin
-
-            elif self.version == "gemv_fast":
-                q_linear_module = WQLinear_GEMVFast
-
-            else:
-                raise ValueError(f"Unknown version {self.version}")
-
-            q_linear = q_linear_module.from_linear(
-                linear=linear_layer,
-                w_bit=self.w_bit,
-                group_size=self.group_size,
-                init_only=False,
-                scales=scales,
-                zeros=zeros,
-            )
-
-            linear_layer.cpu()
-            q_linear.to(next(module.parameters()).device)
+            logger.warning(f"Quantizing {name}")
+            # linear_layer = linear_layer.cpu().half()
+            # # NOTE: small regression in perplexity if linear layer uses .cpu().float()
+            # # linear_layer = linear_layer.to(get_best_device()).half()
+
+            # linear_layer.weight.data, scales, zeros = self.pseudo_quantize_tensor(
+            #     linear_layer.weight.data
+            # )
+
+            # if self.version == "gemm":
+            #     scales = scales.t().contiguous()
+            #     if zeros is not None:
+            #         zeros = zeros.t().contiguous()
+            #     q_linear_module = WQLinear_GEMM
+
+            # elif self.version == "gemv":
+            #     q_linear_module = WQLinear_GEMV
+
+            # elif self.version == "marlin":
+            #     q_linear_module = WQLinear_Marlin
+
+            # elif self.version == "gemv_fast":
+            #     q_linear_module = WQLinear_GEMVFast
+
+            # else:
+            #     raise ValueError(f"Unknown version {self.version}")
+            # linear_layer = linear_layer.cpu()
+            # from neural_compressor.torch.algorithms.weight_only.rtn import RTNQuantizer
+            # from neural_compressor.torch.quantization.config import RTNConfig
+            # config = RTNConfig(group_size=self.group_size, bits=self.w_bit)
+            # config_dict = config.to_dict()
+            # config_dict["scheme"] = "sym"  # ?
+            # rtn_quantizer = RTNQuantizer(quant_config={'': config_dict})
+            # q_linear = rtn_quantizer.quantize(linear_layer)
+            # # breakpoint()
+            # # breakpoint()
+
+            # # q_linear = linear_layer
+
+            # # q_linear = q_linear_module.from_linear(
+            # #     linear=linear_layer,
+            # #     w_bit=self.w_bit,
+            # #     group_size=self.group_size,
+            # #     init_only=False,
+            # #     scales=scales,
+            # #     zeros=zeros,
+            # # )
+            # logger.warning(f"got q_linear {q_linear}")
+
+            # linear_layer.cpu()
+            # q_linear.to(next(module.parameters()).device)
+            q_linear = linear_layer
             set_op_by_name(module, name, q_linear)
+            # set_op_by_name(module, name, q_linear)
+            logger.warning(f"update {name} to {q_linear}")
             clear_memory()
 
     @torch.no_grad()
@@ -325,11 +357,13 @@ def _search_best_scale(
         with torch.no_grad():
             module_kwargs = self._sanitize_kwargs(kwargs, module2inspect)
             fp16_output = self._module_forward(inp, module2inspect, module_kwargs)
+            htcore.mark_step()
 
         # [STEP 4]: Compute loss
         best_scales = self._compute_best_scale(
             inp, w_mean, x_mean, module2inspect, layers, fp16_output, module_kwargs
         )
+        htcore.mark_step()
 
         return (
             get_op_name(module, prev_op),
@@ -367,7 +401,8 @@ def _compute_best_scale(
         device = x.device
         x_mean = x_mean.view(-1).to(device)
         w_mean = w_mean.view(-1).to(device)
-
+
+        logger.warning("Searching for best scale")
         for ratio in range(n_grid):
             # create new scales
             ratio = ratio / n_grid
@@ -450,6 +485,7 @@ def _search_best_clip(self, layer, named_linears, input_feat):
         avoid_clipping = ["q_", "k_", "query", "key", "Wqkv"]
 
         for name in named_linears:
+            logger.warning(f"Searching for best clip: {name}")
             # due to qk bmm, it is hard to clip precisely
             if any([_ in name for _ in avoid_clipping]):
                 continue
@@ -594,6 +630,7 @@ def forward(self, *args, **kwargs):
         return modules, layer_kwargs, inps
 
     def _get_input_feat(self, layer, named_linears):
+        logger.warning("Computing input features for layer %s", layer)
         # firstly, get input features of all linear layers
         def cache_input_hook(m, x, y, name, feat_dict):
             x = x[0]
diff --git a/awq/quantize/scale.py b/awq/quantize/scale.py
index d3b5e266..5fdf31ca 100644
--- a/awq/quantize/scale.py
+++ b/awq/quantize/scale.py
@@ -1,3 +1,5 @@
+import logging
+logger = logging.getLogger(__name__)
 import torch
 import torch.nn as nn
 from typing import Tuple, List
@@ -10,6 +12,7 @@
 from transformers.models.gemma2.modeling_gemma2 import Gemma2RMSNorm
 from transformers.models.cohere.modeling_cohere import CohereLayerNorm
 from transformers.activations import NewGELUActivation, PytorchGELUTanh, GELUActivation
+import habana_frameworks.torch.core as htcore
 
 allowed_norms = [nn.LayerNorm, LlamaRMSNorm, GemmaRMSNorm, Gemma2RMSNorm, CohereLayerNorm]
 allowed_act_fns = [
@@ -36,6 +39,7 @@ def apply_clip(module, clip_list: Tuple[str, torch.Tensor]):
 
 def apply_scale(module, scales_list, input_feat_dict=None):
     for prev_op_name, layer_names, scales in scales_list:
+        logger.warning(f"Apply scale {prev_op_name} -> {layer_names}")
         prev_op = get_op_by_name(module, prev_op_name)
         layers = [get_op_by_name(module, name) for name in layer_names]
@@ -77,7 +81,7 @@ def apply_scale(module, scales_list, input_feat_dict=None):
                 if layer_name in input_feat_dict:
                     inp = input_feat_dict[layer_name]
                     inp.div_(scales.view(1, -1).to(inp.device))
-
+        htcore.mark_step()
         prev_op.cpu()
         for layer in layers:
             layer.cpu()
diff --git a/awq/utils/utils.py b/awq/utils/utils.py
index 7553c5df..5c631e8e 100644
--- a/awq/utils/utils.py
+++ b/awq/utils/utils.py
@@ -73,6 +73,10 @@ def clear_memory(weight=None):
     if weight is not None:
         del weight
     gc.collect()
+    if is_hpex_available():
+        # import habana_frameworks.torch.core as htcore
+        # torch.hpu.empty_cache()
+        return
     torch.cuda.empty_cache()
 
 
@@ -86,9 +90,20 @@ def compute_memory_used_pct(device):
     return memory_pct
 
 
+def is_hpex_available():
+    try:
+        import habana_frameworks.torch.core as htcore
+        HPEX_AVAILABLE = True
+    except ImportError:
+        HPEX_AVAILABLE = False
+    return HPEX_AVAILABLE
+
 def get_best_device():
     if torch.backends.mps.is_available():
         return "mps"
+    elif is_hpex_available():
+        # FIXME: return device name with index?
+        return "hpu"
     elif torch.cuda.is_available():
         return "cuda:0"
     else:
diff --git a/examples/quantize.py b/examples/quantize.py
index aa45b78a..efd8383b 100644
--- a/examples/quantize.py
+++ b/examples/quantize.py
@@ -1,21 +1,97 @@
+from regex import R
+import torch
 from awq import AutoAWQForCausalLM
+# from awq.models._config import AWQConfig
 from transformers import AutoTokenizer
+from torchutils.eval import eval_wikitext2
+from torchutils.freeze import freeze_seed
+freeze_seed()
 
 model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
-quant_path = 'mistral-instruct-v0.2-awq'
+model_path = 'facebook/opt-125m'
+
+model_path = 'meta-llama/Llama-2-7b-chat-hf'
+# model_path = "Qwen/Qwen1.5-0.5B-Chat"
+model_path = "Qwen/Qwen2.5-7B-Instruct"
+"""
+perplexity 6.7588
+time 3.866 sec
+{'perplexity': 6.7588, 'prediction_time': 3.866}
+perplexity 7.2708
+time 3.567 sec
+{'perplexity': 7.2708, 'prediction_time': 3.567}
+"""
+
+model_path = 'Qwen/Qwen1.5-0.5B'
+"""
+perplexity 15.3238
+time 2.856 sec
+{'perplexity': 15.3238, 'prediction_time': 2.856}
+
+
+perplexity 14.8191
+time 3.566 sec
+{'perplexity': 14.8191, 'prediction_time': 3.566}
+
+perplexity 14.8191
+time 3.703 sec
+{'perplexity': 14.8191, 'prediction_time': 3.703}
+"""
+
+quant_path = f"{model_path.replace('/', '__')}-quantized"
 quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
+limit = 20
 
 # Load model
 model = AutoAWQForCausalLM.from_pretrained(
-    model_path, low_cpu_mem_usage=True, use_cache=False
+    model_path,
+    torch_dtype=torch.float32,
+    **{"low_cpu_mem_usage": True, "use_cache": False}
 )
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
-# Quantize
+
+# import habana_frameworks.torch.core as htcore
+device = torch.device("hpu")
+
+# result = eval_wikitext2(model.to(device), tokenizer, verbose=True, limit=limit)
+
+
+
+# # Quantize
+# from awq.quantize.quantizer import AwqQuantizer
+
+
 model.quantize(tokenizer, quant_config=quant_config)
+breakpoint()
+model = model
+delattr(model, "quantizer")
+
+from neural_compressor.torch.quantization import prepare, convert, quantize, RTNConfig, get_default_rtn_config
+
+quant_config = RTNConfig(use_sym=False, bits=4, group_size=128)
+q_model = quantize(model, quant_config=quant_config)
+# model = prepare(model, RTNConfig(use_sym=False, bits=4, group_size=128))
+# qmodel = convert(model, RTNConfig())
+
+result = eval_wikitext2(q_model.to(device), tokenizer, verbose=True, limit=limit)
+"""
+perplexity 16.7339
+time 3.566 sec
+{'perplexity': 16.7339, 'prediction_time': 3.566}
+"""
+# # Quantize
+# # # Save quantized model
+# model.to("cpu")
+# model.save_quantized(quant_path)
+# tokenizer.save_pretrained(quant_path)
+
+# print(f'Model is quantized and saved at "{quant_path}"')
+
-# Save quantized model
-model.save_quantized(quant_path)
-tokenizer.save_pretrained(quant_path)
-print(f'Model is quantized and saved at "{quant_path}"')
\ No newline at end of file
+# model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
+# tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=False, **{"low_cpu_mem_usage": True, "use_cache": False})
+# model.cpu()
+# result_awq_reload_qmodel = eval_wikitext2(model.to(device), tokenizer, limit=limit)
+# print(f"AWQ reloaded model perplexity: {result_awq_reload_qmodel}")
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 3b631933..194ea176 100644
--- a/setup.py
+++ b/setup.py
@@ -4,10 +4,21 @@
 from setuptools import setup, find_packages
 from torch.utils.cpp_extension import CUDAExtension
 
+
+def is_hpex_available():
+    try:
+        import habana_frameworks.torch.core as htcore
+        return True
+    except ImportError:
+        return False
+
+HPEX_AVAILABLE = is_hpex_available()
+
+
 AUTOAWQ_VERSION = "0.2.6"
 PYPI_BUILD = os.getenv("PYPI_BUILD", "0") == "1"
 INSTALL_KERNELS = os.getenv("INSTALL_KERNELS", "0") == "1"
-IS_CPU_ONLY = not torch.backends.mps.is_available() and not torch.cuda.is_available()
+IS_CPU_ONLY = not torch.backends.mps.is_available() and not torch.cuda.is_available() and not HPEX_AVAILABLE
 TORCH_VERSION = str(os.getenv("TORCH_VERSION", None) or torch.__version__).split('+', maxsplit=1)[0]
 CUDA_VERSION = os.getenv("CUDA_VERSION", None) or torch.version.cuda
@@ -19,6 +30,7 @@
     ROCM_VERSION_LEN = min(len(ROCM_VERSION.split(".")), 3)
     ROCM_VERSION = "".join(ROCM_VERSION.split("."))[:ROCM_VERSION_LEN]
 
+
 if not PYPI_BUILD:
     if IS_CPU_ONLY:
         AUTOAWQ_VERSION += "+cpu"
@@ -26,6 +38,8 @@
         AUTOAWQ_VERSION += f"+cu{CUDA_VERSION}"
     elif ROCM_VERSION:
         AUTOAWQ_VERSION += f"+rocm{ROCM_VERSION}"
+    elif HPEX_AVAILABLE:
+        AUTOAWQ_VERSION += "+hpu"
     else:
         raise RuntimeError(
             "Your system must have either Nvidia or AMD GPU to build this package."
@@ -76,9 +90,9 @@
 except ImportError:
     KERNELS_INSTALLED = False
 
+
 if not KERNELS_INSTALLED and CUDA_VERSION and INSTALL_KERNELS and CUDA_VERSION.startswith("12"):
     requirements.append("autoawq-kernels")
-
 elif IS_CPU_ONLY:
     requirements.append("intel-extension-for-pytorch>=2.4.0")
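
Note (reviewer sketch, not part of the patch): a minimal smoke test for the HPU device-selection helpers this diff adds to awq/utils/utils.py. The behaviour described in the comments ("hpu" preferred over "cuda:0", and clear_memory() returning before torch.cuda.empty_cache()) follows the patched code above; the tensor shape and print formatting are illustrative only.

    import torch

    from awq.utils.utils import clear_memory, get_best_device, is_hpex_available

    # After this patch, get_best_device() resolves in the order: "mps" -> "hpu" -> "cuda:0" -> "cpu".
    device = get_best_device()
    print(f"HPEX available: {is_hpex_available()}, best device: {device}")

    # clear_memory() now returns early when habana_frameworks is importable, so it is safe to call
    # on Gaudi hosts where torch.cuda.empty_cache() would not apply.
    w = torch.randn(1024, 1024)
    clear_memory(w)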