
Commit 4cac65a

[Dev] Bump Version to dev0.8 and fix issue INT8xINT2 (apache#49)
* improve e4m3 decoding
* append fp16xint1
* Update submodule commit reference
* chore: Update shared memory scope for float32 output dtype
* BUGFIX: UINT8/INT8 Decoding
* feat: Add rasterization options for roller module
* Refactor tensorcore_legalization method to optimize tensor core usage
* feat: Add function to collect variables from expression, improve for splitk
* chore: Update typing import in __init__.py
* chore: Refactor CPU execution of operators
* Refactor matmul implementation for splitk layout
* chore: Update version to 0.0.1.dev8

Co-authored-by: LeiWang199 <leiwang199>
1 parent 99a744e

File tree

9 files changed: +22 −8 lines

VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-0.0.1.dev7
+0.0.1.dev8

integration/BitNet/utils_quant.py

Lines changed: 2 additions & 1 deletion
@@ -119,7 +119,6 @@ def native_forward(self, input):
         return out
 
     def forward_fp32_simulated(self, input):
-        print("input: ", input)
         quant_input = self.activation_quant(input, self.input_bits).detach()
         quant_weight = self.weight_quant(self.weight).detach()
 
@@ -139,6 +138,8 @@ def forward_fp32_simulated(self, input):
         return out
 
     def forward(self, input):
+        # return self.forward_fp32_simulated(input)
+
         quant_input = self.activation_quant(input, self.input_bits).detach()
         fp32_out = self.bitblas_matmul(quant_input, self.weight)
         sw = self.sw
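
For orientation: the `forward` path above quantizes activations on the fly and hands the int8 tensor to the fused BitBLAS kernel. A minimal sketch of a BitNet-style per-token absmax `activation_quant` (an assumption about its shape for illustration; the actual implementation lives elsewhere in utils_quant.py):

    import torch

    def activation_quant_sketch(x: torch.Tensor, num_bits: int = 8) -> torch.Tensor:
        # Per-token absmax quantization: scale each row so its largest
        # magnitude maps onto the signed integer range, then round and clamp.
        qmax = 2 ** (num_bits - 1) - 1  # 127 for int8
        scale = qmax / x.abs().max(dim=-1, keepdim=True).values.clamp(min=1e-5)
        return (x * scale).round().clamp(-qmax - 1, qmax)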

python/bitblas/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -81,4 +81,4 @@ def _init_logger():
 
 _init_logger()
 
-__version__ = "0.0.1.dev7"
+__version__ = "0.0.1.dev8"

python/bitblas/base/utils.py

Lines changed: 2 additions & 1 deletion
@@ -19,7 +19,7 @@
 import tempfile
 import itertools
 from tvm.ir.supply import GlobalVarSupply
-from bitblas.utils import tensor_replace_dp4a, tensor_remove_make_int4
+from bitblas.utils import tensor_replace_dp4a, tensor_remove_make_int4, tensor_remove_make_int2
 import logging
 
 logger = logging.getLogger(__name__)
@@ -205,6 +205,7 @@ def _build(context) -> str:
     def tvm_callback_cuda_postproc(code, _):
         code = tensor_replace_dp4a(code)
         code = tensor_remove_make_int4(code)
+        code = tensor_remove_make_int2(code)
         return code
 
     with tvm.transform.PassContext(config={"tir.use_async_copy": True, **config.pass_context}):
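
For context, `tvm_callback_cuda_postproc` is the hook name TVM looks up to let user code rewrite generated CUDA source before it is handed to nvcc; the diff simply adds the new `tensor_remove_make_int2` cleanup to that chain. A minimal sketch of registering such a hook globally (the registration style here is an assumption for illustration; BitBLAS defines the callback locally inside `_build`):

    import tvm
    from bitblas.utils import (tensor_replace_dp4a, tensor_remove_make_int4,
                               tensor_remove_make_int2)

    @tvm.register_func("tvm_callback_cuda_postproc", override=True)
    def _cuda_postproc(code, _):
        # Apply the same string-level cleanups to every generated CUDA kernel.
        code = tensor_replace_dp4a(code)
        code = tensor_remove_make_int4(code)
        return tensor_remove_make_int2(code)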

python/bitblas/ops/general_matmul.py

Lines changed: 2 additions & 1 deletion
@@ -10,7 +10,7 @@
 from .impl.matmul_dequantize_impl import (
     select_implementation as weight_dequantize_implementation,)
 from .impl.matmul_impl import select_implementation as consistent_implementation
-from ..base.utils import tensor_replace_dp4a, tensor_remove_make_int4
+from ..base.utils import tensor_replace_dp4a, tensor_remove_make_int4, tensor_remove_make_int2
 from bitblas.utils.target_detector import auto_detect_nvidia_target
 from dataclasses import dataclass
 from .ladder_permutate import LadderPermutate, LadderPermutateConfig
@@ -398,6 +398,7 @@ def _select_implementation(self):
     def post_process(self, code: str) -> str:
         code = tensor_replace_dp4a(code)
         code = tensor_remove_make_int4(code)
+        code = tensor_remove_make_int2(code)
         return code
 
     def retrieve_weight_shape(self):

python/bitblas/ops/matmul.py

Lines changed: 2 additions & 1 deletion
@@ -7,7 +7,7 @@
 from typing import List, Union, Optional, Any, Tuple
 from .operator import Operator, TransformKind
 from .impl.matmul_impl import select_implementation
-from bitblas.utils import tensor_replace_dp4a, tensor_remove_make_int4
+from bitblas.utils import tensor_replace_dp4a, tensor_remove_make_int4, tensor_remove_make_int2
 from dataclasses import dataclass
 from .ladder_permutate import LadderPermutate, LadderPermutateConfig
 import logging
@@ -189,6 +189,7 @@ def _select_implementation(self):
     def post_process(self, code: str) -> str:
         code = tensor_replace_dp4a(code)
         code = tensor_remove_make_int4(code)
+        code = tensor_remove_make_int2(code)
         return code
 
     def _profile_latency_with_dynamic_range(self) -> List:

python/bitblas/ops/matmul_dequantize.py

Lines changed: 2 additions & 1 deletion
@@ -6,7 +6,7 @@
 from typing import Any, List, Literal, Optional, Tuple, Union
 from .operator import Operator, TransformKind
 from .impl.matmul_dequantize_impl import select_implementation
-from ..base.utils import tensor_replace_dp4a, tensor_remove_make_int4
+from ..base.utils import tensor_replace_dp4a, tensor_remove_make_int4, tensor_remove_make_int2
 from bitblas.utils.tensor_adapter import tvm_tensor_to_torch
 from dataclasses import dataclass
 from .ladder_permutate import LadderPermutate, LadderPermutateConfig
@@ -234,6 +234,7 @@ def _select_implementation(self):
     def post_process(self, code: str) -> str:
         code = tensor_replace_dp4a(code)
         code = tensor_remove_make_int4(code)
+        code = tensor_remove_make_int2(code)
         return code
 
     def retrieve_weight_shape(self):

python/bitblas/utils/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
-from .post_process import match_global_kernel, tensor_replace_dp4a, tensor_remove_make_int4  # noqa: F401
+from .post_process import match_global_kernel, tensor_replace_dp4a, tensor_remove_make_int4, tensor_remove_make_int2  # noqa: F401
 from .tensor_adapter import tvm_tensor_to_torch, lazy_tvm_tensor_to_torch, lazy_torch_to_tvm_tensor  # noqa: F401
 from .target_detector import get_all_nvidia_targets, auto_detect_nvidia_target  # noqa: F401

python/bitblas/utils/post_process.py

Lines changed: 9 additions & 0 deletions
@@ -27,3 +27,12 @@ def tensor_remove_make_int4(source: str) -> str:
         "make_int4(0, 0, 0, 0)",
     )
     return source
+
+def tensor_remove_make_int2(source: str) -> str:
+    # remove make_int2 with 8 signed char arguments
+    # TODO(lei): this should be fixed upstream in TVM in the future
+    source = source.replace(
+        "make_int2((signed char)0, (signed char)0, (signed char)0, (signed char)0, (signed char)0, (signed char)0, (signed char)0, (signed char)0)",
+        "make_int2(0, 0)",
+    )
+    return source
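
This is the heart of the INT8xINT2 fix: for these kernels TVM's CUDA codegen can emit a `make_int2(...)` initializer carrying eight `signed char` zeros, which nvcc rejects because `int2` has only two fields, so the helper rewrites it to a plain zero initializer. A quick self-contained check (the surrounding kernel text is illustrative, not captured from a real build):

    from bitblas.utils import tensor_remove_make_int2

    args = ", ".join(["(signed char)0"] * 8)      # the invalid 8-argument list
    bad = f"int2 fragment = make_int2({args});"   # as the codegen may emit it
    print(tensor_remove_make_int2(bad))
    # -> int2 fragment = make_int2(0, 0);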
