Use new layout for xpu qlinear #8896

Merged · 2 commits · Sep 7, 2023
Changes from all commits
36 changes: 36 additions & 0 deletions python/llm/src/bigdl/llm/ggml/model/llama/llama_cpp.py
@@ -999,6 +999,42 @@ def ggml_dequantize_q4_0(
_lib.ggml_quantize_q4_0.restype = None


def ggml_q_format_convet_cpu2xpu(
src: ctypes.c_void_p,
dst: ctypes.c_void_p,
n: ctypes.c_int,
qtype: ctypes.c_int
):
_lib.ggml_q_format_convet_cpu2xpu(src, dst, n, qtype)


_lib.ggml_q_format_convet_cpu2xpu.argtypes = [
ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_int,
ctypes.c_int,
]
_lib.ggml_q_format_convet_cpu2xpu.restype = None


def ggml_q_format_convet_xpu2cpu(
src: ctypes.c_void_p,
dst: ctypes.c_void_p,
n: ctypes.c_int,
qtype: ctypes.c_int
):
_lib.ggml_q_format_convet_xpu2cpu(src, dst, n, qtype)


_lib.ggml_q_format_convet_xpu2cpu.argtypes = [
ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_int,
ctypes.c_int
]
_lib.ggml_q_format_convet_xpu2cpu.restype = None


# def ggml_dequantize_nf4(
# src: ctypes.c_void_p,
# dst: ctypes.c_void_p,
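The two bindings above follow the same ctypes pattern as the existing quantize/dequantize entry points: raw `c_void_p` source and destination pointers, a logical element count, and a qtype id. A minimal sketch of how they are meant to be driven from Python, assuming the import path below and a hypothetical helper name; it mirrors the higher-level wrappers added to `low_bit_linear.py` further down:

```python
import ctypes

import torch

from bigdl.llm.ggml.model.llama import llama_cpp as ggml  # assumed import path for this module


def repack_cpu_to_xpu(packed: torch.Tensor, num_elem: int, qtype: int) -> torch.Tensor:
    """Hypothetical helper: re-pack an already-quantized uint8 buffer from the
    CPU (ggml) layout into the layout expected by the XPU qlinear kernels."""
    out = torch.empty_like(packed)           # same byte size, different element layout
    ggml.ggml_q_format_convet_cpu2xpu(
        ctypes.c_void_p(packed.data_ptr()),  # src: pointer into the packed CPU buffer
        ctypes.c_void_p(out.data_ptr()),     # dst: destination buffer
        num_elem,                            # logical element count of the original weight
        qtype,                               # quantization type id
    )
    return out
```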
55 changes: 51 additions & 4 deletions python/llm/src/bigdl/llm/transformers/low_bit_linear.py
@@ -47,6 +47,8 @@
import torch
import torch.nn.functional as F
from torch import Tensor, device, dtype, nn
from operator import mul
from functools import reduce

T = TypeVar("T", bound="torch.nn.Module")
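The new `mul`/`reduce` imports are only used further down to turn a stored weight shape into the logical element count that the format-conversion routines expect, e.g.:

```python
from functools import reduce
from operator import mul

shape = (4096, 11008)             # illustrative weight shape
num_elem = reduce(mul, shape, 1)  # product of all dimensions
assert num_elem == 4096 * 11008   # 45,088,768 logical elements
```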

Expand Down Expand Up @@ -86,6 +88,38 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, device=None):
return dst_tensor


def ggml_q_format_convet_cpu2xpu(tensor: torch.Tensor, num_elem: int, qtype: int):

invalidInputError(tensor.dtype == torch.uint8,
"Input tensor must be uint8")

    invalidInputError(tensor.device == torch.device('cpu'),
                      "Input tensor must be on CPU")

src = ctypes.c_void_p(tensor.data.data_ptr())

dst_tensor = torch.empty_like(tensor)
dst = ctypes.c_void_p(dst_tensor.data.data_ptr())
ggml.ggml_q_format_convet_cpu2xpu(src, dst, num_elem, qtype)
return dst_tensor


def ggml_q_format_convet_xpu2cpu(tensor: torch.Tensor, num_elem: int, qtype: int):

invalidInputError(tensor.dtype == torch.uint8,
"Input tensor must be uint8")

    invalidInputError(tensor.device == torch.device('cpu'),
                      "Input tensor must be on CPU")

src = ctypes.c_void_p(tensor.data.data_ptr())

dst_tensor = torch.empty_like(tensor)
dst = ctypes.c_void_p(dst_tensor.data.data_ptr())
ggml.ggml_q_format_convet_xpu2cpu(src, dst, num_elem, qtype)
return dst_tensor
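A hedged sanity check for the two wrappers above, assuming it sits in the same module (so no extra imports are needed) and that the two layouts are pure re-packings of the same quantized data; the helper itself is not part of this PR:

```python
import torch


def layout_roundtrip_ok(weight: torch.Tensor, qtype: int) -> bool:
    """Quantize a float weight on CPU, convert the packed buffer to the XPU
    layout and back, and check that the original bytes are recovered."""
    packed_cpu = ggml_convert_qtype(weight, qtype)  # uint8 buffer in the CPU (ggml) layout
    num_elem = weight.numel()                       # same count as reduce(mul, shape, 1) in to() below
    packed_xpu = ggml_q_format_convet_cpu2xpu(packed_cpu, num_elem, qtype)
    packed_back = ggml_q_format_convet_xpu2cpu(packed_xpu, num_elem, qtype)
    return torch.equal(packed_back, packed_cpu)
```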


def ggml_int4_convert_fp32(tensor: torch.Tensor, weight_shape: tuple, k: int):
invalidInputError(tensor.dtype == torch.uint8,
"Input tensor must be uint8")
@@ -106,7 +140,6 @@ class FP4Params(torch.nn.Parameter):
def __new__(cls,
data=None,
requires_grad=False,
-                old_data=None,
quantized=False,
_shape=None,
qtype=None):
@@ -154,14 +187,29 @@ def to(self, *args, **kwargs):
return self.quantize(device.type)
elif (device is not None and device.type == "xpu" and self.data.device.type == "cpu"):
# enter xpu logic, compile linear_int4 extension at first time
-            q_tensor = self.quantize(device)  # tensor is cpu now
+            self.quantize(device)  # tensor is cpu now
self.data = ggml_q_format_convet_cpu2xpu(self.data,
reduce(mul, self._shape, 1),
self.qtype)
new_param = FP4Params(super().to(device=device,
dtype=dtype,
non_blocking=non_blocking),
requires_grad=self.requires_grad,
quantized=self.quantized,
_shape=self._shape,
qtype=self.qtype)
return new_param
elif (device is not None and device.type == "cpu" and self.data.device.type == "xpu"):
new_param = FP4Params(super().to(device=device,
dtype=dtype,
non_blocking=non_blocking),
requires_grad=self.requires_grad,
quantized=self.quantized,
_shape=self._shape,
qtype=self.qtype)
new_param.data = ggml_q_format_convet_xpu2cpu(new_param.data,
reduce(mul, new_param._shape, 1),
new_param.qtype)
return new_param
else:
new_param = FP4Params(super().to(device=device,
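Taken together, the xpu and cpu branches mean that moving an FP4Params between devices now re-packs the already-quantized buffer instead of round-tripping through float weights. A usage sketch, assuming an XPU device and the compiled extension are available and that `ggml_tensor_qtype` is the library's qtype lookup table (both are assumptions, not part of this diff):

```python
import torch

from bigdl.llm.ggml.quantize import ggml_tensor_qtype  # assumed location of the qtype table

qtype = ggml_tensor_qtype["sym_int4"]  # assumed key for 4-bit symmetric quantization

weight = torch.randn(4096, 4096)       # illustrative float weight
param = FP4Params(weight, requires_grad=False, quantized=False, _shape=None, qtype=qtype)

param_xpu = param.to("xpu")      # quantizes on CPU, re-packs to the XPU layout, then moves
param_cpu = param_xpu.to("cpu")  # moves back to CPU, then re-packs to the CPU (ggml) layout
```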
@@ -217,7 +265,6 @@ def __init__(self, input_features, output_features, qtype, bias=True):
super().__init__(input_features, output_features, bias)
self.weight = FP4Params(self.weight.data,
requires_grad=False,
-                                old_data=self.weight.data,
quantized=False, _shape=None, qtype=qtype)
self.in_len = input_features
self.out_len = output_features
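With the `old_data` field gone, the layer keeps only the packed low-bit weight. A module-level usage sketch; the class name `LowBitLinear`, the qtype lookup, the XPU device, and the sizes are all assumptions taken from outside the hunks shown here:

```python
import torch

from bigdl.llm.ggml.quantize import ggml_tensor_qtype  # assumed location of the qtype table

layer = LowBitLinear(4096, 11008,                      # assumed class name defined earlier in this file
                     qtype=ggml_tensor_qtype["sym_int4"],
                     bias=False)
layer = layer.to("xpu")    # weight is quantized once on CPU, then re-packed into the XPU layout

x = torch.randn(1, 32, 4096, dtype=torch.float16, device="xpu")
y = layer(x)               # dispatches to linear_q4_0.forward_new, as shown in forward() below
print(y.shape)             # torch.Size([1, 32, 11008])
```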
@@ -249,7 +296,7 @@ def forward(self, x: torch.Tensor):
if x_2d.shape[0] > 1 and x_2d.dtype == torch.float32:
x_2d = x_2d.half()
# input format of linear_q4.forward is 1: input, 2: weight
-            result = linear_q4_0.forward(x_2d, x0, self.qtype)
+            result = linear_q4_0.forward_new(x_2d, x0, self.qtype)
new_shape = x_shape[:-1] + (self.out_len,)
result = result.view(new_shape)
if self.bias is not None:
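For reference, the reshape bookkeeping around the `forward_new` call: the activation is flattened to two dimensions before the kernel call (outside the hunk shown) and the leading dimensions are restored afterwards via `x_shape[:-1] + (self.out_len,)`. A small illustration with assumed sizes and a stand-in for the kernel output:

```python
import torch

x = torch.randn(1, 32, 4096)       # (batch, seq, hidden) activation, illustrative sizes
out_len = 11008                    # output features of the layer

x_shape = x.shape
x_2d = x.view(-1, x_shape[-1])     # 2-D view consumed by the kernel: (32, 4096)
result = torch.empty(x_2d.shape[0], out_len)  # stand-in for linear_q4_0.forward_new(...)

new_shape = x_shape[:-1] + (out_len,)         # same leading dims, new feature size
result = result.view(new_shape)
assert result.shape == (1, 32, 11008)
```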