Support int8 texture tensors without requiring int8 buffers (#4485)
Summary:
Pull Request resolved: #4485

## Context

By default, storage buffers in Vulkan must contain 32-bit data types; using 8-bit and 16-bit data types in buffers can be enabled optionally via the [VK_KHR_8bit_storage](https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VK_KHR_8bit_storage.html) and [VK_KHR_16bit_storage](https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VK_KHR_16bit_storage.html) extensions, respectively.

Previously, 8-bit and 16-bit tensors were enabled through those extensions; this meant that 8-bit and 16-bit tensors could not be used if the Vulkan driver did not support the corresponding extension.

This diff adds support for 8-bit texture-backed tensors without requiring the VK_KHR_8bit_storage extension. This is done by introducing shaders that manually pack four 8-bit integers into a single int32 value when writing to a staging buffer, and unpack them when reading from one. Once the tensor data has been transferred to an image texture (which uses the `VK_FORMAT_R8G8B8A8_SINT` image format), the extension is no longer required.
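
To make the packing scheme concrete, here is a minimal host-side C++ sketch of the same bit manipulation the new shaders perform (editorial, not part of this diff; the helper name `pack_4xi8` is illustrative only):

```cpp
#include <cassert>
#include <cstdint>

// Pack four 8-bit values into one 32-bit integer. "Later" values go into the
// more significant bytes, so the packed int32 has the same byte layout that
// four consecutive int8 values would have in a little-endian buffer.
std::int32_t pack_4xi8(std::int8_t v0, std::int8_t v1, std::int8_t v2,
                       std::int8_t v3) {
  const std::uint32_t packed =
      (static_cast<std::uint32_t>(static_cast<std::uint8_t>(v3)) << 24) |
      (static_cast<std::uint32_t>(static_cast<std::uint8_t>(v2)) << 16) |
      (static_cast<std::uint32_t>(static_cast<std::uint8_t>(v1)) << 8) |
      static_cast<std::uint32_t>(static_cast<std::uint8_t>(v0));
  return static_cast<std::int32_t>(packed);  // assumes two's complement
}

int main() {
  // {-1, 2, -3, 4} packs to 0x04FD02FF (bytes FF 02 FD 04, least to most
  // significant), i.e. the original values in order on a little-endian host.
  assert(pack_4xi8(-1, 2, -3, 4) == 0x04FD02FF);
  return 0;
}
```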

Reviewed By: jorgep31415

Differential Revision: D60536832

fbshipit-source-id: 8d3d8b069582ab8c18d41701c864778621d2f6e3
SS-JIA authored and facebook-github-bot committed Aug 2, 2024
1 parent 4483bb6 commit 448c7d3
Showing 16 changed files with 320 additions and 73 deletions.
18 changes: 9 additions & 9 deletions backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -319,24 +319,20 @@ utils::uvec3 ComputeGraph::create_global_wg_size(const ValueRef idx) {
   return image_extents_of(idx);
 }
 
-utils::uvec3 ComputeGraph::create_local_wg_size(const ValueRef idx) {
+utils::uvec3 ComputeGraph::create_local_wg_size(
+    const utils::uvec3 global_wg_size) {
   if (config_.enable_local_wg_size_override) {
     return config_.local_wg_size_override;
   }
 
-  if (is_buffer_storage(idx)) {
-    return {64u, 1u, 1u};
-  }
-
-  const utils::uvec3 image_extents = image_extents_of(idx);
   utils::uvec3 local_group_size = {4, 4, 4};
 
-  if (image_extents.data[2u] == 1) {
-    if (image_extents.data[1u] == 1) {
+  if (global_wg_size.data[2u] == 1) {
+    if (global_wg_size.data[1u] == 1) {
       local_group_size.data[0u] = 64;
       local_group_size.data[1u] = 1;
       local_group_size.data[2u] = 1;
-    } else if (image_extents.data[1u] < 8) {
+    } else if (global_wg_size.data[1u] < 8) {
       local_group_size.data[0u] = 16;
       local_group_size.data[1u] = 4;
       local_group_size.data[2u] = 1;
@@ -349,6 +345,10 @@ utils::uvec3 ComputeGraph::create_local_wg_size(const ValueRef idx) {
   return local_group_size;
 }
 
+utils::uvec3 ComputeGraph::create_local_wg_size(const ValueRef idx) {
+  return create_local_wg_size(image_extents_of(idx));
+}
+
 void ComputeGraph::copy_into_staging(
     const ValueRef idx,
     const void* data,
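As a worked example of the heuristic above (not part of the diff): the local workgroup always totals 64 invocations, shaped to fit the global size. A flat global size such as {256u, 1u, 1u} — which is what the new int8 staging path produces — maps to a local size of {64u, 1u, 1u}; a short, wide size such as {100u, 4u, 1u} maps to {16u, 4u, 1u}; global sizes with more depth fall through to the branches collapsed above, starting from the default {4, 4, 4}.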
28 changes: 24 additions & 4 deletions backends/vulkan/runtime/graph/ComputeGraph.h
@@ -180,7 +180,9 @@ class ComputeGraph final {
     return values_.at(idx).type();
   }
 
-  // Get Tensor Property
+  //
+  // Tensor Properties Accessors
+  //
 
   std::vector<int64_t> sizes_of(const ValueRef idx) const;
 
@@ -226,7 +228,9 @@
     return values_.at(idx).toTensor().ntexels_ubo();
   }
 
+  //
   // Scalar Value Extraction
+  //
 
   template <typename T>
   T extract_scalar(const ValueRef idx) {
@@ -459,16 +463,21 @@
   utils::uvec3 create_global_wg_size(const ValueRef idx);
 
   /*
-   * Suggest a local workgroup size for a given `api::vTensor` value, assuming
-   * that every shader invocation calculates one texel element of the output
-   * tensor.
+   * Suggest a local workgroup size for a given global workgroup size.
    *
    * The local workgroup size will be formed to try and minimize the number of
   * inactive invocations.
    *
    * Currently, the local workgroup size is hard-coded to contain a total of 64
    * shader invocations. In the future, this value can be configured.
    */
+  utils::uvec3 create_local_wg_size(const utils::uvec3 global_wg_size);
+
+  /*
+   * Convenience function to suggest a local workgroup size for a given
+   * `api::vTensor` value, assuming that every shader invocation calculates one
+   * texel element of the output tensor.
+   */
   utils::uvec3 create_local_wg_size(const ValueRef idx);
 
   //
@@ -500,6 +509,17 @@
   void resize_input(const int64_t idx, const std::vector<int64_t>& new_sizes);
   void propagate_resize();
 
+  //
+  // Miscellaneous Utilities
+  //
+
+  /*
+   * Check whether the GPU supports 8 bit buffers.
+   */
+  inline bool int8_buffers_enabled() const {
+    return context_->adapter_ptr()->has_full_int8_buffers_support();
+  }
+
   //
   // Debug support (implemented in Logging.cpp)
   //
2 changes: 1 addition & 1 deletion backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
@@ -80,7 +80,7 @@ ivec4 from_nchw_buffer_i(int buf_i, ivec4 sizes) {
  * Returns: The (x, y, z, n) texel position corresponding to the first element
  * of the texel at the specified buffer index
  */
-ivec4 to_texel_pos(int buf_i, ivec4 strides, int packed_dim) {
+ivec4 to_tensor_idx(int buf_i, ivec4 strides, int packed_dim) {
   ivec4 idx;
   for (int i = 3; i >= 0; i--) {
     if (i != packed_dim) {
54 changes: 54 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/int8_tensor_to_nchw_noint8.glsl
@@ -0,0 +1,54 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

#define PRECISION ${PRECISION}

#include "indexing_utils.h"

layout(std430) buffer;

#extension GL_EXT_control_flow_attributes : require

${layout_declare_tensor(0, "r", "t_in", "int8", "texture3d")}
${layout_declare_buffer(1, "w", "nchw_out", "int")}
${layout_declare_ubo(2, "ivec4", "tensor_sizes")}
${layout_declare_ubo(3, "int", "out_ntexels")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int packed_dim = C_DIM;

void main() {
const int out_buf_idx = int(gl_GlobalInvocationID.x);
if (out_buf_idx >= out_ntexels) {
return;
}

ivec4 values;
int in_buf_idx = 4 * out_buf_idx;

[[unroll]] for (int i = 0; i < 4; ++i) {
const ivec4 tensor_idx = from_nchw_buffer_i(in_buf_idx, tensor_sizes);
const ivec4 texture_pos = to_texture_elem_pos(
tensor_idx, tensor_sizes, packed_dim);
values[i] = load_texel(t_in, texture_pos.xyz)[texture_pos.w];
in_buf_idx++;
}

// Manually pack 4x 8-bit integers into a 32 bit integer. Note that little
// endian is assumed, since most processors use little endian. Thus the
// "later" values are placed in most significant bytes.
int packed = ((values[3] & 0xFF) << 24)
| ((values[2] & 0xFF) << 16)
| ((values[1] & 0xFF) << 8)
| ((values[0] & 0xFF));

nchw_out[out_buf_idx] = packed;
}
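A quick endianness check on the packing above (editorial, not part of the commit): because "later" values land in more significant bytes, reinterpreting the packed int32 buffer as bytes on a little-endian host recovers the int8 values in their original order.

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // The shader above would pack values {-1, 2, -3, 4} as
  // ((4 & 0xFF) << 24) | ((-3 & 0xFF) << 16) | ((2 & 0xFF) << 8) | (-1 & 0xFF).
  const std::int32_t packed = 0x04FD02FF;

  std::int8_t bytes[4];
  std::memcpy(bytes, &packed, sizeof(packed));  // little-endian host assumed
  assert(bytes[0] == -1 && bytes[1] == 2 && bytes[2] == -3 && bytes[3] == 4);
  return 0;
}
```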
74 changes: 74 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_tensor_noint8.glsl
@@ -0,0 +1,74 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

#define PRECISION ${PRECISION}

#include "indexing_utils.h"

layout(std430) buffer;

#extension GL_EXT_control_flow_attributes : require

${layout_declare_tensor(0, "w", "t_out", "int8", "texture3d")}
${layout_declare_buffer(1, "r", "nchw_in", "int")}
${layout_declare_ubo(2, "ivec4", "tensor_sizes")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int packed_dim = C_DIM;

/*
* Extends sign of int8
*/
int extend_sign(int x) {
if (x >> 7 == 1) {
return x | 0xFFFFFF00;
}
return x;
}

ivec4 read_texel(ivec4 tensor_idx) {
const ivec4 buf_indices = get_texel_nchw_buffer_ixs(
tensor_idx, tensor_sizes, packed_dim);

int shift = (1 << 8) - 1;
ivec4 masks;
// Masks used to unpack 4x 8-bit values from a 32 bit integer. Note that
// little endian is assumed, as most processors use little endian. Thus the
// most significant bytes correspond to the "latter" packed values.
masks.x = shift << (8 * (buf_indices.x % 4));
masks.y = shift << (8 * (buf_indices.y % 4));
masks.z = shift << (8 * (buf_indices.z % 4));
masks.w = shift << (8 * (buf_indices.w % 4));

ivec4 out_tex = ivec4(0);

[[unroll]] for (int i = 0; i < 4; ++i) {
if (tensor_idx[packed_dim] + i < tensor_sizes[packed_dim]) {
int in_texel = nchw_in[buf_indices[i] / 4];
int extracted_val = (in_texel & masks[i]) >> (8 * (buf_indices[i] % 4));
extracted_val = extend_sign(extracted_val);
out_tex[i] = extracted_val;
}
}

return out_tex;
}

void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);
const ivec4 tensor_idx = to_tensor_idx(pos, tensor_sizes, packed_dim);

if (any(greaterThanEqual(tensor_idx, tensor_sizes))) {
return;
}

write_texel(t_out, pos, read_texel(tensor_idx));
}
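To make the mask arithmetic in `read_texel` concrete, here is a host-side C++ mirror of the masked extraction plus `extend_sign` (editorial sketch, not part of the commit; `extract_i8` is a hypothetical name, and `byte_idx` plays the role of `buf_indices[i] % 4`):

```cpp
#include <cassert>
#include <cstdint>

// Host-side mirror of the shader's masked extraction plus extend_sign.
std::int32_t extract_i8(std::int32_t in_texel, int byte_idx) {
  const std::int32_t mask =
      static_cast<std::int32_t>(0xFFu << (8 * byte_idx));
  std::int32_t val = (in_texel & mask) >> (8 * byte_idx);
  // extend_sign: the extracted byte sits in a 32-bit int with zeroed upper
  // bits, so a set sign bit must be propagated into the upper 24 bits.
  if (val >> 7 == 1) {
    val |= ~0xFF;
  }
  return val;
}

int main() {
  const std::int32_t texel = 0x04FD02FF;  // packs {-1, 2, -3, 4}
  assert(extract_i8(texel, 0) == -1);
  assert(extract_i8(texel, 2) == -3);
  assert(extract_i8(texel, 3) == 4);
  return 0;
}
```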
2 changes: 1 addition & 1 deletion backends/vulkan/runtime/graph/ops/glsl/nchw_to_tensor.glsl
@@ -62,7 +62,7 @@ void main() {
     return;
   }
 
-  ivec4 tensor_idx = to_texel_pos(t_id, gpu_strides, packed_dim);
+  ivec4 tensor_idx = to_tensor_idx(t_id, gpu_strides, packed_dim);
   tensor_idx[packed_dim] *= 4;
   t_out[t_id] = read_texel(tensor_idx);
 }
2 changes: 1 addition & 1 deletion backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl
@@ -53,7 +53,7 @@ void main() {
     return;
   }
 
-  const ivec4 out_pos = to_texel_pos(t_id, out_strides, 0);
+  const ivec4 out_pos = to_tensor_idx(t_id, out_strides, 0);
 
   VEC4_T outtex = q_8w_linear(out_pos, mat1_sizes.x);
   write_texel(t_out, t_id, outtex);
2 changes: 1 addition & 1 deletion backends/vulkan/runtime/graph/ops/glsl/tensor_to_nchw.glsl
@@ -61,7 +61,7 @@ void main() {
   }
 
   const VEC4_T intex = t_in[t_id];
-  ivec4 tensor_idx = to_texel_pos(t_id, gpu_strides, packed_dim);
+  ivec4 tensor_idx = to_tensor_idx(t_id, gpu_strides, packed_dim);
   tensor_idx[packed_dim] *= 4;
   write_out_texel(intex, tensor_idx);
 }
31 changes: 24 additions & 7 deletions backends/vulkan/runtime/graph/ops/impl/Staging.cpp
@@ -21,8 +21,8 @@ void add_staging_to_tensor_node(
     const ValueRef out_tensor) {
   VK_CHECK_COND(graph.val_is_staging(in_staging));
 
-  vkapi::ShaderInfo shader =
-      get_nchw_to_tensor_shader(*graph.get_tensor(out_tensor));
+  vkapi::ShaderInfo shader = get_nchw_to_tensor_shader(
+      *graph.get_tensor(out_tensor), graph.int8_buffers_enabled());
 
   vkapi::ParamsBindList ubos({graph.sizes_ubo(out_tensor)});
   if (graph.is_buffer_storage(out_tensor)) {
@@ -55,10 +55,26 @@ void add_tensor_to_staging_node(
     const ValueRef out_staging) {
   VK_CHECK_COND(graph.val_is_staging(out_staging));
 
-  vkapi::ShaderInfo shader =
-      get_tensor_to_nchw_shader(*graph.get_tensor(in_tensor));
+  vkapi::ShaderInfo shader = get_tensor_to_nchw_shader(
+      *graph.get_tensor(in_tensor), graph.int8_buffers_enabled());
 
+  utils::uvec3 global_wg_size = graph.create_global_wg_size(in_tensor);
   vkapi::ParamsBindList ubos({graph.sizes_ubo(in_tensor)});
 
+  // Normally, the tensor_to_nchw shader is structured so that each thread reads
+  // one texel from the input texture and writes each component of the texel
+  // into the corresponding location in the output buffer. However, this shader
+  // is structured slightly differently in that each thread writes out a
+  // complete 32 bit integer (containing 4 packed 8-bit integers) into the
+  // output buffer. Therefore, the global work group size for this shader will
+  // be the number of elements in the output buffer divided by 4, as opposed to
+  // the extents of the input texture.
+  if (shader.kernel_name == "int8_tensor_to_nchw_noint8") {
+    uint32_t buffer_len = graph.get_staging(out_staging)->numel() / 4;
+    global_wg_size = {buffer_len, 1, 1};
+    ubos.append({graph.ntexels_ubo(in_tensor)});
+  }
+
   if (graph.is_buffer_storage(in_tensor)) {
     ubos.append({
         graph.texel_strides_ubo(in_tensor),
@@ -69,8 +85,8 @@
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
       shader,
-      graph.create_global_wg_size(in_tensor),
-      graph.create_local_wg_size(in_tensor),
+      global_wg_size,
+      graph.create_local_wg_size(global_wg_size),
       // Input and Outputs
       {{in_tensor, vkapi::MemoryAccessType::READ},
        {out_staging, vkapi::MemoryAccessType::WRITE}},
@@ -86,7 +102,8 @@ ValueRef prepack(
     const utils::GPUMemoryLayout layout) {
   ValueRef v = graph.add_tensor_like(vref, layout);
 
-  vkapi::ShaderInfo shader = get_nchw_to_tensor_shader(*graph.get_tensor(v));
+  vkapi::ShaderInfo shader = get_nchw_to_tensor_shader(
+      *graph.get_tensor(v), graph.int8_buffers_enabled());
 
   vkapi::ParamsBindList ubos({graph.sizes_ubo(v)});
   if (graph.is_buffer_storage(v)) {
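As a worked example of the special-cased sizing above (not part of the diff; assumes the staging buffer's `numel()` equals the tensor's element count): an int8 tensor with 1024 elements yields `buffer_len = 1024 / 4 = 256`, so `global_wg_size` becomes {256u, 1u, 1u} and `create_local_wg_size(global_wg_size)` selects {64u, 1u, 1u} — four workgroups, with each invocation writing one packed int32.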
18 changes: 16 additions & 2 deletions backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp
@@ -95,21 +95,35 @@ void set_staging_zeros(api::StorageBuffer& staging, const size_t nbytes) {
   memset(data_ptr, 0, staging.nbytes());
 }
 
-vkapi::ShaderInfo get_nchw_to_tensor_shader(const api::vTensor& v_dst) {
+vkapi::ShaderInfo get_nchw_to_tensor_shader(
+    const api::vTensor& v_dst,
+    const bool int8_buffer_enabled) {
   std::string kernel_name;
   kernel_name.reserve(kShaderNameReserve);
 
+  if (v_dst.dtype() == vkapi::kChar &&
+      v_dst.storage_type() == utils::kTexture3D && !int8_buffer_enabled) {
+    return VK_KERNEL(nchw_to_int8_tensor_noint8);
+  }
+
   kernel_name = "nchw_to_tensor";
   add_dtype_suffix(kernel_name, v_dst);
   add_storage_type_suffix(kernel_name, v_dst);
 
   return VK_KERNEL_FROM_STR(kernel_name);
 }
 
-vkapi::ShaderInfo get_tensor_to_nchw_shader(const api::vTensor& v_src) {
+vkapi::ShaderInfo get_tensor_to_nchw_shader(
+    const api::vTensor& v_src,
+    bool int8_buffer_enabled) {
   std::string kernel_name;
   kernel_name.reserve(kShaderNameReserve);
 
+  if (v_src.dtype() == vkapi::kChar &&
+      v_src.storage_type() == utils::kTexture3D && !int8_buffer_enabled) {
+    return VK_KERNEL(int8_tensor_to_nchw_noint8);
+  }
+
   kernel_name = "tensor_to_nchw";
   add_dtype_suffix(kernel_name, v_src);
   add_storage_type_suffix(kernel_name, v_src);
8 changes: 6 additions & 2 deletions backends/vulkan/runtime/graph/ops/utils/StagingUtils.h
@@ -31,7 +31,11 @@ void set_staging_zeros(api::StorageBuffer& staging, const size_t nbytes);
 // Functions to get shaders
 //
 
-vkapi::ShaderInfo get_nchw_to_tensor_shader(const api::vTensor& v_dst);
-vkapi::ShaderInfo get_tensor_to_nchw_shader(const api::vTensor& v_src);
+vkapi::ShaderInfo get_nchw_to_tensor_shader(
+    const api::vTensor& v_dst,
+    bool int8_buffer_enabled = true);
+vkapi::ShaderInfo get_tensor_to_nchw_shader(
+    const api::vTensor& v_src,
+    bool int8_buffer_enabled = true);

} // namespace vkcompute
[5 of 16 changed files not shown]