@@ -124,8 +124,8 @@ def group_quantize_tensor_from_qparams(w, scales, zeros, n_bit=4, groupsize=128)
         .to(torch.int32)
         .reshape_as(w)
     )
-
-    return w_int32
+    w_uint8 = (w_int32[::, ::2] << 4 | w_int32[::, 1::2]).to(torch.uint8)
+    return w_uint8
 
 
 def group_quantize_tensor(w, n_bit=4, groupsize=128):
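A minimal sketch (not part of the patch) of the nibble packing the hunk above introduces: even columns of the quantized int32 tensor go to the high nibble and odd columns to the low nibble of each byte. The `unpack_uint8_to_int4` helper is hypothetical, shown only to verify the round trip.

```python
import torch

def pack_int4_to_uint8(w_int32: torch.Tensor) -> torch.Tensor:
    # Two 4-bit values per byte: even column -> high nibble, odd column -> low nibble.
    return (w_int32[::, ::2] << 4 | w_int32[::, 1::2]).to(torch.uint8)

def unpack_uint8_to_int4(w_uint8: torch.Tensor) -> torch.Tensor:
    # Hypothetical inverse: split each byte back into its two nibbles.
    high = (w_uint8 >> 4).to(torch.int32)
    low = (w_uint8 & 0xF).to(torch.int32)
    return torch.stack([high, low], dim=-1).reshape(w_uint8.shape[0], -1)

w = torch.randint(0, 16, (4, 8), dtype=torch.int32)  # 4-bit values in [0, 15]
assert torch.equal(unpack_uint8_to_int4(pack_int4_to_uint8(w)), w)
```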
@@ -357,10 +357,9 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
 ##### weight only int4 per channel groupwise quantized code ######
 
 def prepare_int4_weight_and_scales_and_zeros(weight_bf16, groupsize, inner_k_tiles):
-    weight_int32, scales_and_zeros = group_quantize_tensor(
+    weight_int4pack, scales_and_zeros = group_quantize_tensor(
         weight_bf16, n_bit=4, groupsize=groupsize
     )
-    weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(weight_int32, inner_k_tiles)
     return weight_int4pack, scales_and_zeros
 
 
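With the hunk above, `group_quantize_tensor` already returns the nibble-packed uint8 tensor, so the `torch.ops.aten._convert_weight_to_int4pack` repack step is dropped (leaving `inner_k_tiles` unused in this function). A hypothetical smoke test, assuming the patched file is importable as `quantize`:

```python
import torch
from quantize import prepare_int4_weight_and_scales_and_zeros  # assumed module name

weight_bf16 = torch.randn(256, 512, dtype=torch.bfloat16)
weight_int4pack, scales_and_zeros = prepare_int4_weight_and_scales_and_zeros(
    weight_bf16, groupsize=128, inner_k_tiles=8
)
assert weight_int4pack.dtype == torch.uint8
assert weight_int4pack.shape == (256, 512 // 2)  # two 4-bit weights per byte
```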
@@ -404,7 +403,7 @@ def __init__(self, mod, groupsize=128, inner_k_tiles=8, padding=True):
 
     @torch.no_grad()
     def create_quantized_state_dict(self, use_cuda=True):
-        if use_cuda:
+        if use_cuda and torch.cuda.is_available():
             device = "cuda"
         else:
             device = "cpu"
@@ -507,7 +506,7 @@ def __init__(
         assert in_features % (inner_k_tiles * 16) == 0, "require in_features % (innerKTiles * 16) == 0"
         self.register_buffer(
             "weight",
-            torch.empty((out_features // 8, in_features // (inner_k_tiles * 16), 32, inner_k_tiles // 2), dtype=torch.int32)
+            torch.empty((out_features, in_features // 2), dtype=torch.uint8)
         )
         self.register_buffer(
             "scales_and_zeros",