From efc071c0499b5f12481caa9c0a8d95edb2828d19 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 22 Oct 2024 09:21:04 +0300 Subject: [PATCH] check awq on hpu Signed-off-by: Yi Liu --- Makefile | 4 ++ awq/modules/linear/gemm.py | 9 ++- awq/quantize/quantizer.py | 127 ++++++++++++++++++++++++------------- awq/quantize/scale.py | 6 +- awq/utils/utils.py | 15 +++++ examples/quantize.py | 90 ++++++++++++++++++++++++-- setup.py | 18 +++++- 7 files changed, 212 insertions(+), 57 deletions(-) create mode 100644 Makefile diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..c94dc841 --- /dev/null +++ b/Makefile @@ -0,0 +1,4 @@ +build: + CUDA_VISIBLE_DEVICES=-1 pip install -e . -vvv + +.PHONY: build \ No newline at end of file diff --git a/awq/modules/linear/gemm.py b/awq/modules/linear/gemm.py index 7ee89cc8..3a376336 100644 --- a/awq/modules/linear/gemm.py +++ b/awq/modules/linear/gemm.py @@ -5,6 +5,8 @@ from awq.utils.module import try_import from awq.utils.utils import get_best_device from awq.utils.packing_utils import dequantize_gemm +import logging +logger = logging.getLogger(__name__) # NOTE: We check if awq_ext or triton is available. awq_ext will be preferred if both are installed. @@ -199,6 +201,7 @@ def from_linear( / awq_linear.scales[idx // group_size] ).to(torch.int)[:, None] ) + logger.warning("Got int weight...") intweight = torch.cat(intweight, dim=1) intweight = intweight.t().contiguous() intweight = intweight.to(dtype=torch.int32) @@ -225,7 +228,7 @@ def from_linear( qweight[:, col] |= qweight_col << (i * awq_linear.w_bit) awq_linear.qweight = qweight - zeros = zeros.to(dtype=torch.int32, device=best_device) + zeros = zeros.to(dtype=torch.int32, device="cpu") if "mps" in best_device: zeros = zeros.to("cpu") @@ -235,7 +238,7 @@ def from_linear( dtype=torch.int32, device=zeros.device, ) - + logger.warning("PACK Qzeros...") for col in range(zeros.shape[1] // pack_num): if awq_linear.w_bit == 4: order_map = [0, 2, 4, 6, 1, 3, 5, 7] @@ -244,7 +247,9 @@ def from_linear( for i in range(pack_num): qzero_col = zeros[:, col * pack_num + order_map[i]] qzeros[:, col] |= qzero_col << (i * awq_linear.w_bit) + logger.warning("PACK Qzeros done...") awq_linear.qzeros = qzeros + awq_linear = awq_linear.to(best_device) return awq_linear diff --git a/awq/quantize/quantizer.py b/awq/quantize/quantizer.py index cd9fb0dd..9dbdbf35 100644 --- a/awq/quantize/quantizer.py +++ b/awq/quantize/quantizer.py @@ -22,7 +22,9 @@ set_op_by_name, exclude_layers_to_not_quantize, ) - +import logging +logger = logging.getLogger(__name__) +import habana_frameworks.torch.core as htcore class AwqQuantizer: def __init__( @@ -70,13 +72,17 @@ def __init__( n_samples=self.max_calib_samples, max_seq_len=self.max_calib_seq_len ) - def pseudo_quantize_tensor(self, w: torch.Tensor): + def pseudo_quantize_tensor(self, w: torch.Tensor, return_int=False): org_w_shape = w.shape if self.group_size > 0: assert org_w_shape[-1] % self.group_size == 0 w = w.reshape(-1, self.group_size) + if torch.isnan(w).sum() > 0: + breakpoint() + logging.error(f"Found {torch.isnan(w).sum()} NaNs in weight matrix") assert w.dim() == 2 assert torch.isnan(w).sum() == 0 + # breakpoint() # zero point quantization if self.zero_point: @@ -86,9 +92,8 @@ def pseudo_quantize_tensor(self, w: torch.Tensor): min_int = 0 scales = (max_val - min_val).clamp(min=1e-5) / max_int zeros = (-torch.round(min_val / scales)).clamp_(min_int, max_int) - w = ( - torch.clamp(torch.round(w / scales) + zeros, min_int, max_int) - zeros - ) * scales + w_int = 
torch.clamp(torch.round(w / scales) + zeros, min_int, max_int) - zeros + w = w_int * scales zeros = zeros.view(org_w_shape[0], -1) else: max_val = w.abs().amax(dim=1, keepdim=True) @@ -97,14 +102,19 @@ def pseudo_quantize_tensor(self, w: torch.Tensor): min_int = -(2 ** (self.w_bit - 1)) scales = max_val / max_int zeros = None - w = torch.clamp(torch.round(w / scales), min_int, max_int) * scales - + w_int = torch.clamp(torch.round(w / scales), min_int, max_int) + w = w_int * scales + if torch.isnan(w).sum() > 0: + breakpoint() + logging.error(f"Found {torch.isnan(w).sum()} NaNs in weight matrix {w.shape}") assert torch.isnan(scales).sum() == 0 assert torch.isnan(w).sum() == 0 scales = scales.view(org_w_shape[0], -1) w = w.reshape(org_w_shape) - + + if return_int: + return w, scales, zeros, w_int.reshape(org_w_shape) return w, scales, zeros def pseudo_dequantize_tensor( @@ -124,7 +134,10 @@ def pseudo_dequantize_tensor( return w def quantize(self): + self._num_modules = len(self.modules) for i in tqdm(range(len(self.modules)), desc="AWQ"): + # if i > 1: + # return # Move module and inputs to correct device common_device = next(self.modules[i].parameters()).device if common_device is None or str(common_device) == "cpu": @@ -171,6 +184,7 @@ def quantize(self): scales_list = append_str_prefix( scales_list, get_op_name(self.model, self.modules[i]) + "." ) + logger.warning(f"Applied scales: {scales_list}") # [STEP 3]: Compute and apply clipping list if self.apply_clip: @@ -199,43 +213,61 @@ def pack(self): def _apply_quant(self, module, named_linears: Dict[str, nn.Linear]): for name, linear_layer in named_linears.items(): - # NOTE: small regression in perplexity if linear layer uses .cpu().float() - linear_layer = linear_layer.to(get_best_device()).half() - - linear_layer.weight.data, scales, zeros = self.pseudo_quantize_tensor( - linear_layer.weight.data - ) - - if self.version == "gemm": - scales = scales.t().contiguous() - if zeros is not None: - zeros = zeros.t().contiguous() - q_linear_module = WQLinear_GEMM - - elif self.version == "gemv": - q_linear_module = WQLinear_GEMV - - elif self.version == "marlin": - q_linear_module = WQLinear_Marlin - - elif self.version == "gemv_fast": - q_linear_module = WQLinear_GEMVFast - - else: - raise ValueError(f"Unknown version {self.version}") - - q_linear = q_linear_module.from_linear( - linear=linear_layer, - w_bit=self.w_bit, - group_size=self.group_size, - init_only=False, - scales=scales, - zeros=zeros, - ) - - linear_layer.cpu() - q_linear.to(next(module.parameters()).device) + logger.warning(f"Quantizing {name}") + # linear_layer = linear_layer.cpu().half() + # # NOTE: small regression in perplexity if linear layer uses .cpu().float() + # # linear_layer = linear_layer.to(get_best_device()).half() + + # linear_layer.weight.data, scales, zeros = self.pseudo_quantize_tensor( + # linear_layer.weight.data + # ) + + # if self.version == "gemm": + # scales = scales.t().contiguous() + # if zeros is not None: + # zeros = zeros.t().contiguous() + # q_linear_module = WQLinear_GEMM + + # elif self.version == "gemv": + # q_linear_module = WQLinear_GEMV + + # elif self.version == "marlin": + # q_linear_module = WQLinear_Marlin + + # elif self.version == "gemv_fast": + # q_linear_module = WQLinear_GEMVFast + + # else: + # raise ValueError(f"Unknown version {self.version}") + # linear_layer = linear_layer.cpu() + # from neural_compressor.torch.algorithms.weight_only.rtn import RTNQuantizer + # from neural_compressor.torch.quantization.config import RTNConfig + # 
config = RTNConfig(group_size=self.group_size, bits=self.w_bit) + # config_dict = config.to_dict() + # config_dict["scheme"] = "sym" # ? + # rtn_quantizer = RTNQuantizer(quant_config={'': config_dict}) + # q_linear = rtn_quantizer.quantize(linear_layer) + # # breakpoint() + # # breakpoint() + + # # q_linear = linear_layer + + # # q_linear = q_linear_module.from_linear( + # # linear=linear_layer, + # # w_bit=self.w_bit, + # # group_size=self.group_size, + # # init_only=False, + # # scales=scales, + # # zeros=zeros, + # # ) + # logger.warning(f"got q_linear {q_linear}") + + # linear_layer.cpu() + # q_linear.to(next(module.parameters()).device) + q_linear = linear_layer set_op_by_name(module, name, q_linear) + # set_op_by_name(module, name, q_linear) + logger.warning(f"update {name} to {q_linear}") clear_memory() @torch.no_grad() @@ -325,11 +357,13 @@ def _search_best_scale( with torch.no_grad(): module_kwargs = self._sanitize_kwargs(kwargs, module2inspect) fp16_output = self._module_forward(inp, module2inspect, module_kwargs) + htcore.mark_step() # [STEP 4]: Compute loss best_scales = self._compute_best_scale( inp, w_mean, x_mean, module2inspect, layers, fp16_output, module_kwargs ) + htcore.mark_step() return ( get_op_name(module, prev_op), @@ -367,7 +401,8 @@ def _compute_best_scale( device = x.device x_mean = x_mean.view(-1).to(device) w_mean = w_mean.view(-1).to(device) - + + logger.warning("Searching for best scale") for ratio in range(n_grid): # create new scales ratio = ratio / n_grid @@ -450,6 +485,7 @@ def _search_best_clip(self, layer, named_linears, input_feat): avoid_clipping = ["q_", "k_", "query", "key", "Wqkv"] for name in named_linears: + logger.warning(f"Searching for best clip: {name}") # due to qk bmm, it is hard to clip precisely if any([_ in name for _ in avoid_clipping]): continue @@ -594,6 +630,7 @@ def forward(self, *args, **kwargs): return modules, layer_kwargs, inps def _get_input_feat(self, layer, named_linears): + logger.warning("Computing input features for layer %s", layer) # firstly, get input features of all linear layers def cache_input_hook(m, x, y, name, feat_dict): x = x[0] diff --git a/awq/quantize/scale.py b/awq/quantize/scale.py index d3b5e266..5fdf31ca 100644 --- a/awq/quantize/scale.py +++ b/awq/quantize/scale.py @@ -1,3 +1,5 @@ +from turtle import ht +from venv import logger import torch import torch.nn as nn from typing import Tuple, List @@ -10,6 +12,7 @@ from transformers.models.gemma2.modeling_gemma2 import Gemma2RMSNorm from transformers.models.cohere.modeling_cohere import CohereLayerNorm from transformers.activations import NewGELUActivation, PytorchGELUTanh, GELUActivation +import habana_frameworks.torch.core as htcore allowed_norms = [nn.LayerNorm, LlamaRMSNorm, GemmaRMSNorm, Gemma2RMSNorm, CohereLayerNorm] allowed_act_fns = [ @@ -36,6 +39,7 @@ def apply_clip(module, clip_list: Tuple[str, torch.Tensor]): def apply_scale(module, scales_list, input_feat_dict=None): for prev_op_name, layer_names, scales in scales_list: + logger.warning(f"Apply scale {prev_op_name} -> {layer_names}") prev_op = get_op_by_name(module, prev_op_name) layers = [get_op_by_name(module, name) for name in layer_names] @@ -77,7 +81,7 @@ def apply_scale(module, scales_list, input_feat_dict=None): if layer_name in input_feat_dict: inp = input_feat_dict[layer_name] inp.div_(scales.view(1, -1).to(inp.device)) - + htcore.mark_step() prev_op.cpu() for layer in layers: layer.cpu() diff --git a/awq/utils/utils.py b/awq/utils/utils.py index 7553c5df..5c631e8e 100644 --- 
a/awq/utils/utils.py +++ b/awq/utils/utils.py @@ -73,6 +73,10 @@ def clear_memory(weight=None): if weight is not None: del weight gc.collect() + if is_hpex_available(): + # import habana_frameworks.torch.core as htcore + # torch.hpu.empty_cache() + return torch.cuda.empty_cache() @@ -86,9 +90,20 @@ def compute_memory_used_pct(device): return memory_pct +def is_hpex_available(): + try: + import habana_frameworks.torch.core as htcore + HPEX_AVAILABLE = True + except ImportError: + HPEX_AVAILABLE = False + return HPEX_AVAILABLE + def get_best_device(): if torch.backends.mps.is_available(): return "mps" + elif is_hpex_available(): + # FIXME: return device name with index? + return "hpu" elif torch.cuda.is_available(): return "cuda:0" else: diff --git a/examples/quantize.py b/examples/quantize.py index aa45b78a..efd8383b 100644 --- a/examples/quantize.py +++ b/examples/quantize.py @@ -1,21 +1,97 @@ +from regex import R +import torch from awq import AutoAWQForCausalLM +# from awq.models._config import AWQConfig from transformers import AutoTokenizer +from torchutils.eval import eval_wikitext2 +from torchutils.freeze import freeze_seed +freeze_seed() model_path = 'mistralai/Mistral-7B-Instruct-v0.2' -quant_path = 'mistral-instruct-v0.2-awq' +model_path = 'facebook/opt-125m' + +model_path = 'meta-llama/Llama-2-7b-chat-hf' +# model_path = "Qwen/Qwen1.5-0.5B-Chat" +model_path = "Qwen/Qwen2.5-7B-Instruct" +""" +perplexity 6.7588 +time 3.866 sec +{'perplexity': 6.7588, 'prediction_time': 3.866} +perplexity 7.2708 +time 3.567 sec +{'perplexity': 7.2708, 'prediction_time': 3.567} +""" + +model_path = 'Qwen/Qwen1.5-0.5B' +""" +perplexity 15.3238 +time 2.856 sec +{'perplexity': 15.3238, 'prediction_time': 2.856} + + +perplexity 14.8191 +time 3.566 sec +{'perplexity': 14.8191, 'prediction_time': 3.566} + +perplexity 14.8191 +time 3.703 sec +{'perplexity': 14.8191, 'prediction_time': 3.703} +""" + +quant_path = f"{model_path.replace('/', '__')}-quantized" quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } +limit = 20 # Load model model = AutoAWQForCausalLM.from_pretrained( - model_path, low_cpu_mem_usage=True, use_cache=False + model_path, + torch_dtype=torch.float32, + **{"low_cpu_mem_usage": True, "use_cache": False} ) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) -# Quantize + +# import habana_frameworks.torch.core as htcore +device = torch.device("hpu") + +# result = eval_wikitext2(model.to(device), tokenizer, verbose=True, limit=limit) + + + +# # Quantize +# from awq.quantize.quantizer import AwqQuantizer + + model.quantize(tokenizer, quant_config=quant_config) +breakpoint() +model = model +delattr(model, "quantizer") + +from neural_compressor.torch.quantization import prepare, convert, quantize, RTNConfig, get_default_rtn_config + +quant_config = RTNConfig(use_sym=False, bits=4, group_size=128) +q_model = quantize(model, quant_config=quant_config) +# model = prepare(model, RTNConfig(use_sym=False, bits=4, group_size=128)) +# qmodel = convert(model, RTNConfig()) + +result = eval_wikitext2(q_model.to(device), tokenizer, verbose=True, limit=limit) +""" +perplexity 16.7339 +time 3.566 sec +{'perplexity': 16.7339, 'prediction_time': 3.566} +""" +# # Quantize +# # # Save quantized model +# model.to("cpu") +# model.save_quantized(quant_path) +# tokenizer.save_pretrained(quant_path) + +# print(f'Model is quantized and saved at "{quant_path}"') + -# Save quantized model -model.save_quantized(quant_path) 
-tokenizer.save_pretrained(quant_path) -print(f'Model is quantized and saved at "{quant_path}"') \ No newline at end of file +# model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False) +# tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=False, **{"low_cpu_mem_usage": True, "use_cache": False}) +# model.cpu() +# result_awq_reload_qmodel = eval_wikitext2(model.to(device), tokenizer, limit=limit) +# print(f"AWQ reloaded model perplexity: {result_awq_reload_qmodel}") \ No newline at end of file diff --git a/setup.py b/setup.py index 3b631933..194ea176 100644 --- a/setup.py +++ b/setup.py @@ -4,10 +4,21 @@ from setuptools import setup, find_packages from torch.utils.cpp_extension import CUDAExtension + +def is_hpex_available(): + try: + import habana_frameworks.torch.core as htcore + return True + except ImportError: + return False + +HPEX_AVAILABLE = is_hpex_available() + + AUTOAWQ_VERSION = "0.2.6" PYPI_BUILD = os.getenv("PYPI_BUILD", "0") == "1" INSTALL_KERNELS = os.getenv("INSTALL_KERNELS", "0") == "1" -IS_CPU_ONLY = not torch.backends.mps.is_available() and not torch.cuda.is_available() +IS_CPU_ONLY = not torch.backends.mps.is_available() and not torch.cuda.is_available() and not HPEX_AVAILABLE TORCH_VERSION = str(os.getenv("TORCH_VERSION", None) or torch.__version__).split('+', maxsplit=1)[0] CUDA_VERSION = os.getenv("CUDA_VERSION", None) or torch.version.cuda @@ -19,6 +30,7 @@ ROCM_VERSION_LEN = min(len(ROCM_VERSION.split(".")), 3) ROCM_VERSION = "".join(ROCM_VERSION.split("."))[:ROCM_VERSION_LEN] + if not PYPI_BUILD: if IS_CPU_ONLY: AUTOAWQ_VERSION += "+cpu" @@ -26,6 +38,8 @@ AUTOAWQ_VERSION += f"+cu{CUDA_VERSION}" elif ROCM_VERSION: AUTOAWQ_VERSION += f"+rocm{ROCM_VERSION}" + elif HPEX_AVAILABLE: + AUTOAWQ_VERSION += "+hpu" else: raise RuntimeError( "Your system must have either Nvidia or AMD GPU to build this package." @@ -76,9 +90,9 @@ except ImportError: KERNELS_INSTALLED = False + if not KERNELS_INSTALLED and CUDA_VERSION and INSTALL_KERNELS and CUDA_VERSION.startswith("12"): requirements.append("autoawq-kernels") - elif IS_CPU_ONLY: requirements.append("intel-extension-for-pytorch>=2.4.0")
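Note on the quantization math touched above: the core of pseudo_quantize_tensor() is plain grouped zero-point (asymmetric) quantization, and the new return_int path simply exposes the intermediate integer tensor alongside the dequantized weights. A minimal standalone sketch of that math (illustrative only; group_quantize and its defaults are not part of the patch):

import torch

def group_quantize(w: torch.Tensor, w_bit: int = 4, group_size: int = 128):
    # Grouped zero-point quantization, mirroring pseudo_quantize_tensor.
    org_shape = w.shape
    w = w.reshape(-1, group_size)                  # one row per quantization group
    max_val = w.amax(dim=1, keepdim=True)
    min_val = w.amin(dim=1, keepdim=True)
    max_int = 2 ** w_bit - 1
    scales = (max_val - min_val).clamp(min=1e-5) / max_int
    zeros = (-torch.round(min_val / scales)).clamp_(0, max_int)
    w_int = torch.clamp(torch.round(w / scales) + zeros, 0, max_int) - zeros
    w_dq = (w_int * scales).reshape(org_shape)     # dequantized ("fake quant") weights
    return w_dq, scales, zeros, w_int.reshape(org_shape)

w = torch.randn(256, 512)
w_dq, scales, zeros, w_int = group_quantize(w)
print((w - w_dq).abs().max())                      # reconstruction error stays within half a scale step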
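The HPU plumbing follows the usual Habana pattern: probe for the habana_frameworks bridge, prefer "hpu" as the target device, and call htcore.mark_step() to flush lazy-mode graphs between heavy stages. A hedged sketch of how those pieces fit together (the import only succeeds on a Gaudi/SynapseAI install; maybe_mark_step is an illustrative helper, not part of the patch):

import torch

def is_hpex_available() -> bool:
    # True when the Habana PyTorch bridge can be imported, i.e. on a Gaudi machine.
    try:
        import habana_frameworks.torch.core  # noqa: F401
        return True
    except ImportError:
        return False

def get_best_device() -> str:
    if torch.backends.mps.is_available():
        return "mps"
    if is_hpex_available():
        return "hpu"
    if torch.cuda.is_available():
        return "cuda:0"
    return "cpu"

def maybe_mark_step() -> None:
    # In HPU lazy mode, mark_step() triggers graph compilation/execution so that
    # results are materialized before the next stage (e.g. between scale-search steps).
    if is_hpex_available():
        import habana_frameworks.torch.core as htcore
        htcore.mark_step()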
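Finally, the qweight/qzeros packing that WQLinear_GEMM.from_linear performs (and that the added "PACK Qzeros" log lines bracket) interleaves eight 4-bit integers per int32 column using AWQ's [0, 2, 4, 6, 1, 3, 5, 7] order map. A small standalone sketch of that packing step, assuming 4-bit values (pack_int4_columns is illustrative, not the module API):

import torch

def pack_int4_columns(vals: torch.Tensor) -> torch.Tensor:
    # Pack int4 values of shape (rows, cols), cols % 8 == 0, into int32 columns
    # using the AWQ GEMM interleaving order.
    order_map = [0, 2, 4, 6, 1, 3, 5, 7]
    pack_num = 8  # 32 bits / 4 bits per value
    rows, cols = vals.shape
    vals = vals.to(torch.int32)
    packed = torch.zeros((rows, cols // pack_num), dtype=torch.int32)
    for col in range(cols // pack_num):
        for i in range(pack_num):
            packed[:, col] |= vals[:, col * pack_num + order_map[i]] << (i * 4)
    return packed

# e.g. packed_zeros = pack_int4_columns(torch.randint(0, 16, (4096, 32)))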