Move QMat2 to buffer storage and scales_and_zeros to Channels Packed (#5515)

Summary:
Pull Request resolved: #5515

Storing QMat2 in a texture gives rise to two main problems:

 - Indexing is messy, and extra computation is needed to account for the fact that we read ivec4 texels but only use half of the values
 - There is no texel fetch for int8; the texel is read as int32 and has to be cast

Keeping QMat2 in a buffer performs better: although reading from buffers is slower, removing the extra computation compensates for it.
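To make the "extra computation" concrete: each byte of QMat2 packs two 4-bit weights, and the shader has to mask off the sign bit (there is no uint8 type available) and select the low or high nibble before dequantizing. Below is a minimal standalone C++ sketch of that unpacking and dequantization with made-up values; it illustrates the arithmetic only and is not the actual shader code.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // One packed byte of QMat2: low nibble = weight at the even index,
  // high nibble = weight at the odd index (each in 0..15).
  int8_t packed = static_cast<int8_t>(0xB3);  // nibbles 0x3 and 0xB

  // Mask to 8 bits so the int8 sign bit is treated as a value bit,
  // mirroring the shader's `& 0xFF`.
  uint32_t byte = static_cast<uint32_t>(packed) & 0xFF;
  uint32_t q_even = byte & 0x0F;  // low nibble
  uint32_t q_odd  = byte >> 4;    // high nibble

  // Dequantize as in the shader: value = scale * q + (zero - scale * 8),
  // i.e. the 4-bit weights are centered around 8.
  float scale = 0.05f;
  float zero_raw = 0.0f;
  float zero = zero_raw - scale * 8.0f;
  printf("even=%f odd=%f\n",
         scale * static_cast<float>(q_even) + zero,
         scale * static_cast<float>(q_odd) + zero);
  return 0;
}
```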

 {F1863459327}

This diff also moves the scales_and_zeros tensor to Channels Packed in the texture implementation, since that layout simply makes more sense; the previous implementation involved some terrible indexing shenanigans.
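For reference, scales_and_zeros has sizes {2, N, k_groups}, where the leading size-2 dim holds the scale and the zero point of each group. Channels-packing that dim puts both values into the same texel (scale in .x, zero in .y), so the texture shader does a single load per group instead of two loads plus component-index math. The following rough CPU-side C++ model of the layout is a sketch for illustration only, not the Vulkan packing code; the Texel struct and fill values are made up.

```cpp
#include <cstdio>
#include <vector>

// Stand-in for a channels-packed texel: components 0/1 hold (scale, zero).
struct Texel { float v[4]; };

int main() {
  const int N = 2;         // output columns
  const int k_groups = 3;  // quantization groups along K

  // Channels-packed scales_and_zeros: texel (x = group, y = n) = {scale, zero, 0, 0}.
  std::vector<Texel> tex(N * k_groups);
  for (int n = 0; n < N; ++n)
    for (int g = 0; g < k_groups; ++g)
      tex[n * k_groups + g] = Texel{{0.1f * (g + 1), float(g), 0.0f, 0.0f}};

  // The shader-side read becomes a single fetch per group:
  const int n = 1, g = 2;
  const Texel& sz = tex[n * k_groups + g];
  const float scale = sz.v[0];
  const float zero = sz.v[1] - scale * 8.0f;  // same re-centering as the shader
  printf("scale=%f zero=%f\n", scale, zero);
  return 0;
}
```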

ghstack-source-id: 244258611
exported-using-ghexport

Reviewed By: yipjustin

Differential Revision: D62504978

fbshipit-source-id: df2fdf87f75140be0a316576c8ffad67feefd6d7
SS-JIA authored and facebook-github-bot committed Sep 23, 2024
1 parent 8be3ce5 commit 2eae7a9
Showing 3 changed files with 42 additions and 41 deletions.
47 changes: 18 additions & 29 deletions backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl
@@ -26,13 +26,14 @@ layout(std430) buffer;

${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
${layout_declare_tensor(1, "r", "t_mat1", DTYPE, STORAGE)}
${layout_declare_tensor(2, "r", "t_mat2", "int8", STORAGE)}
${layout_declare_tensor(2, "r", "t_mat2", "int8", "buffer")}
${layout_declare_tensor(3, "r", "t_scales_and_zeros", DTYPE, STORAGE)}

$if STORAGE == "texture3d":
${layout_declare_ubo(4, "ivec4", "out_sizes")}
${layout_declare_ubo(5, "ivec4", "mat1_sizes")}
${layout_declare_ubo(6, "ivec4", "scales_strides")}
${layout_declare_ubo(6, "ivec4", "mat2_strides")}
${layout_declare_ubo(7, "ivec4", "scales_strides")}
$else:
${layout_declare_ubo(4, "ivec4", "out_sizes")}
${layout_declare_ubo(5, "ivec4", "out_strides")}
@@ -64,9 +65,9 @@ void main() {

float rc = 0.0;
int k = 0;
const uint k_block = (K + group_size - 1) / group_size;

#ifdef USING_BUFFER
const uint k_block = (K + group_size - 1) / group_size;
ivec4 mat1_pos = ivec4(0, m, out_pos.z, out_pos.w);
ivec4 mat2_pos = ivec4(0, n, out_pos.z, out_pos.w);
ivec4 scale_pos = ivec4(0, n, 0, out_pos.w);
@@ -101,42 +102,30 @@ void main() {
t_out[out_bufi] = FLOAT_T(rc);

#else // Using texture
const uint texel_group_size = group_size / FOUR;
const uint k_block = (K + texel_group_size - 1) / texel_group_size;
ivec3 mat1_pos = ivec3(0, m, out_pos.z);
ivec3 mat2_pos = ivec3(0, n, out_pos.z);
ivec3 scale_pos = ivec3(0, n, 0);
ivec3 zero_pos = ivec3(0, n, 1);
ivec4 mat2_pos = ivec4(0, n, out_pos.z, out_pos.w);
ivec3 scale_zero_pos = ivec3(0, n, 0);
uint K_texel = K / FOUR;

for (int kb = 0; kb < k_block; kb++) {
const int texel_kb = kb / FOUR;
const int kb_offset = kb % FOUR;

scale_pos.x = texel_kb;
const VEC4_T scale_texel = load_texel(t_scales_and_zeros, scale_pos);
const float scale = float(scale_texel[kb_offset]);
scale_zero_pos.x = kb;
const vec4 scale_zero = load_texel(t_scales_and_zeros, scale_zero_pos);
const float scale = scale_zero.x;
const float zero = scale_zero.y - scale * 8.0;

zero_pos.x = texel_kb;
const VEC4_T zero_texel = load_texel(t_scales_and_zeros, zero_pos);
const float zero = float(zero_texel[kb_offset]) - scale * 8.0;

for(uint idx = 0; idx < texel_group_size && k < K; idx++, k++) {
for(uint idx = 0; idx < group_size && k < K_texel; idx += FOUR, k++) {
mat1_pos.x = k;
const VEC4_T mat1_tex = load_texel(t_mat1, mat1_pos);

mat2_pos.x = k / 2;
const i8vec4 mat2_tex = i8vec4(load_texel(t_mat2, mat2_pos));
mat2_pos.x = k * 2; // k * FOUR / 2
const int mat2_id = tidx_to_bufi(mat2_pos, mat2_strides);

// Every two texels of mat1 correspond to one texel of mat2
// Even mat1 indeces correspond to first half of mat2 texel and
// odd indeces correspond to second half
const int mat2_offset = (k & 1) == 0 ? 0 : 2;
for (int texel_idx = 0; texel_idx < FOUR; texel_idx++){
for (int texel_pos = 0; texel_pos < FOUR; texel_pos++) {
// Bitwise op treats sign bit from int8 as a value bit instead,
// since there is no uint8_t datatype
uint mat2_val = (mat2_tex[mat2_offset + texel_idx / 2] & 0xFF);
mat2_val = (texel_idx & 1) == 0 ? mat2_val & mask : (mat2_val >> 4);
rc += mat1_tex[texel_idx] * (scale * float(mat2_val) + zero);
uint mat2_val = (t_mat2[mat2_id + texel_pos / 2] & 0xFF);
mat2_val = (texel_pos & 1) == 0 ? mat2_val & mask : (mat2_val >> 4);
rc += mat1_tex[texel_pos] * (scale * float(mat2_val) + zero);
}
}
}
18 changes: 10 additions & 8 deletions backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp
@@ -33,7 +33,13 @@ void check_q_matmul_args(
using namespace WHCN;
VK_CHECK_COND(graph.packed_dim_of(mat1) == kWidthDim);
VK_CHECK_COND(graph.packed_dim_of(mat2_data) == kWidthDim);
VK_CHECK_COND(graph.packed_dim_of(scales_and_zeros) == kWidthDim);
// VK_CHECK_COND(graph.packed_dim_of(scales_and_zeros) == kWidthDim);

if (graph.storage_type_of(scales_and_zeros) == utils::kBuffer) {
VK_CHECK_COND(graph.packed_dim_of(scales_and_zeros) == kWidthDim);
} else {
VK_CHECK_COND(graph.packed_dim_of(scales_and_zeros) == kChannelsDim);
}

if (graph.storage_type_of(out) == utils::kBuffer) {
VK_CHECK_COND(graph.packed_dim_of(out) == kWidthDim);
@@ -106,13 +112,8 @@ void add_q_matmul_node(
const ValueRef out) {
auto storage_type = graph.storage_type_of(out);

ValueRef mat2;

if (storage_type == utils::kBuffer) {
mat2 = prepack_buffer_if_tensor_ref(graph, mat2_data, utils::kWidthPacked);
} else {
mat2 = prepack_if_tensor_ref(graph, mat2_data, utils::kWidthPacked);
}
ValueRef mat2 =
prepack_buffer_if_tensor_ref(graph, mat2_data, utils::kWidthPacked);

ValueRef scales_and_zeros =
prepack_if_tensor_ref(graph, scales_and_zeros_data, utils::kWidthPacked);
@@ -135,6 +136,7 @@
} else {
ubos.append(graph.sizes_ubo(out));
ubos.append(graph.sizes_ubo(mat1));
ubos.append(graph.strides_ubo(mat2));
ubos.append(graph.strides_ubo(scales_and_zeros));
}

18 changes: 14 additions & 4 deletions backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -2932,16 +2932,26 @@ void test_int4pack_mm(
int4mm_pack_weights(mat2_size, B_quant_data.data());

IOValueRef B_int4 =
graph.add_input_tensor(mat2_q_size, vkapi::kQInt8, storage_type);
graph.add_input_tensor(mat2_q_size, vkapi::kQInt8, utils::kBuffer);
graph.copy_into_staging(
B_int4.staging, B_int4_data.data(), B_int4_data.size());

const int k_groups = K / group_size;

// Random scales and zeroes. Keep scales small to avoid overflow and zeroes in
// int4 range
IOValueRef scales_and_zeros =
graph.add_input_tensor({2, N, k_groups}, vkapi::kFloat, storage_type);
IOValueRef scales_and_zeros;

if (storage_type == utils::kBuffer) {
scales_and_zeros.value = graph.add_tensor(
{2, N, k_groups}, vkapi::kFloat, storage_type, utils::kWidthPacked);
} else {
scales_and_zeros.value = graph.add_tensor(
{2, N, k_groups}, vkapi::kFloat, storage_type, utils::kChannelsPacked);
}

scales_and_zeros.staging = graph.set_input_tensor(scales_and_zeros.value);

std::vector<float> s_data(graph.numel_of(scales_and_zeros.value));
const int zeros_stride = s_data.size() / 2;
for (size_t i = 0; i < zeros_stride; i++) {
@@ -3003,7 +3013,7 @@ void test_int4pack_mm(
out_deq.staging, out_deq_data.data(), out_deq_data.size());

for (int i = 0; i < out_int4_data.size(); i++) {
CHECK_VALUE(out_int4_data, i, out_deq_data[i]);
EXPECT_TRUE(check_close(out_int4_data[i], out_deq_data[i]));
}
}

