ModelCloud · Qubitium · Dec 19, 2024 · Dec 19, 2024 · Dec 20, 2024
diff --git a/gptqmodel/nn_modules/qlinear/torch.py b/gptqmodel/nn_modules/qlinear/torch.py
@@ -79,7 +79,7 @@ def __init__(
         )
         self.register_buffer(
             "g_idx",
-            torch.tensor([i // self.group_size for i in range(infeatures)], dtype=torch.int32),
+            torch.tensor([i // self.group_size for i in range(infeatures)], dtype=torch.uint16),
         )
         if bias:
             self.register_buffer("bias", torch.zeros((outfeatures), dtype=weight_dtype))
@@ -107,7 +107,7 @@ def post_init(self):
                 self.outfeatures // 32 * self.bits
             )
             self.scales.resize_((math.ceil(self.padded_infeatures / self.group_size), self.outfeatures), )
-            self.g_idx = torch.tensor([i // self.group_size for i in range(self.padded_infeatures)], dtype=torch.int32,
+            self.g_idx = torch.tensor([i // self.group_size for i in range(self.padded_infeatures)], dtype=torch.uint16,
                                       device=self.g_idx.device)
 
     def pack(self, linear, scales, zeros, g_idx=None):
@@ -174,7 +174,7 @@ def pack(self, linear, scales, zeros, g_idx=None):
         i = 0
         col = 0
         while col < qzeros.shape[1]:
-            if self.bits in [2, 4, 8]:
+            if self.bits == 4 or self.bits == 8 or self.bits == 2:
                 for j in range(i, i + (32 // self.bits)):
                     qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
                 i += 32 // self.bits
@@ -213,7 +213,7 @@ def forward(self, x: torch.Tensor):
         if self.wf.device != self.qzeros.device:
             self.wf = self.wf.to(self.qzeros.device)
 
-        if self.bits in [2, 4, 8]:
+        if self.bits == 4 or self.bits in [8, 2]:
             zeros = torch.bitwise_right_shift(
                 torch.unsqueeze(self.qzeros, 2).expand(-1, -1, 32 // self.bits),
                 self.wf.unsqueeze(0),
@@ -251,23 +251,23 @@ def forward(self, x: torch.Tensor):
 
         weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])
 
-        num_itr = self.g_idx.shape[0] // x.shape[-1]
-        if num_itr == 1:
-            weights = self.scales[self.g_idx.long()] * (weight - zeros[self.g_idx.long()])
+        count = self.g_idx.shape[0] // x.shape[-1]
+        if count == 1:
+            g_idx_int32 = self.g_idx.int()
+            weights = self.scales[g_idx_int32] * (weight - zeros[g_idx_int32])
         else:
-            num_dim = self.g_idx.shape[0] // num_itr
+            num_dim = self.g_idx.shape[0] // count
             weights = []
-            for i in range(num_itr):
+            for i in range(count):
                 scale_i = self.scales[:, i * num_dim : (i + 1) * num_dim]
                 weight_i = weight[:, i * num_dim : (i + 1) * num_dim]
                 zeros_i = zeros[:, i * num_dim : (i + 1) * num_dim]
-                g_idx_i = self.g_idx[i * num_dim : (i + 1) * num_dim]
-                weights.append(scale_i[g_idx_i.long()] * (weight_i - zeros_i[g_idx_i.long()]))
+                g_idx_i = self.g_idx[i * num_dim : (i + 1) * num_dim].int()
+                weights.append(scale_i[g_idx_i] * (weight_i - zeros_i[g_idx_i]))
             weights = torch.cat(weights, dim=1)
-        out = torch.matmul(x, weights)
-        out = out.to(x_dtype)
-        out = out.reshape(out_shape)
-        out = out + self.bias if self.bias is not None else out
+        out = torch.matmul(x, weights).reshape(out_shape)
+        if self.bias is not None:
+            out = out + self.bias
         return out