[ET-VK] Integrate axis mapping into naive matrix multiplication shaders #5277

Closed · wants to merge 5 commits
118 changes: 71 additions & 47 deletions backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -89,11 +89,11 @@ std::vector<int64_t> calculate_strides(
* tensor. Thus the axis mapping can be considered to be in WHCN dimension
* order.
*
- * The last value `axis_mapping.at(3)` indicates the WHCN index of the tensor
+ * The last value `axis_map.at(3)` indicates the WHCN index of the tensor
* dimension along which batches will be concatenated. This dimension can be
* referred to as the "inner dimension". To determine which image texture axis is
* used for the concatenation, a double lookup will need to be performed
- * (axis_mapping.at(axis_mapping.at(3))).
+ * (axis_map.at(axis_map.at(3))).
*
* The reason for structuring the axis mapping this way is that, for the batch dim,
* two things need to be easily derived:
@@ -107,7 +107,7 @@ std::vector<int64_t> calculate_strides(
*
* The axis mapping allows for permuted views of texture-backed tensors.
*/
-std::vector<int64_t> default_axis_mapping() {
+std::vector<int64_t> default_axis_map() {
// Currently, all compute shaders have an assumption that the channels dim is
// used to combine with the batch dim of a tensor. However, once dim mapping
// is integrated into the tensor indexing logic for each compute shader, we
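For illustration, here is a minimal standalone sketch of the double lookup described in the comment above. It is not part of the diff; the {0, 1, 2, 2} default axis map is an assumption inferred from the batch-with-channels convention noted in default_axis_map.

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Assumed default axis map, in WHCN order: width -> X (0), height -> Y (1),
  // channels -> Z (2); the final entry records that batches are concatenated
  // along the channels dim (WHCN index 2).
  std::vector<int64_t> axis_map = {0, 1, 2, 2};

  // Double lookup: first find which WHCN dim receives the batches, then find
  // which image axis that WHCN dim is mapped to.
  const int64_t concatted_whcn_dim = axis_map.at(3);           // 2 (channels)
  const int64_t batch_axis = axis_map.at(concatted_whcn_dim);  // 2 (image Z)
  assert(concatted_whcn_dim == 2 && batch_axis == 2);
  return 0;
}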
@@ -173,40 +173,40 @@ std::vector<int64_t> calculate_padded_sizes(

utils::uvec3 calculate_image_extents(
const std::vector<int64_t>& padded_sizes,
-const std::vector<int64_t>& axis_mapping,
+const std::vector<int64_t>& axis_map,
const utils::GPUMemoryLayout memory_layout) {
VK_CHECK_COND(padded_sizes.size() == 4);
-VK_CHECK_COND(axis_mapping.size() == 4);
+VK_CHECK_COND(axis_map.size() == 4);

utils::uvec3 extents({1, 1, 1});
-// First three elements of axis_mapping indicate which (X,Y,Z) image axis the
+// First three elements of axis_map indicate which (X,Y,Z) image axis the
// width, height, and channels dim of the tensor maps to.
for (int whcn_dim = 0; whcn_dim < 3; ++whcn_dim) {
-const int64_t axis = axis_mapping.at(whcn_dim);
+const int64_t axis = axis_map.at(whcn_dim);
const int64_t dim = padded_sizes.size() - 1 - whcn_dim;
extents[axis] = utils::safe_downcast<uint32_t>(padded_sizes.at(dim));
}

-// axis_mapping[3] indicates the WHCN index of the dimension used for batch
+// axis_map[3] indicates the WHCN index of the dimension used for batch
// concatenation. Thus a double lookup is required to determine the image axis
// used for batch concatenation.
-const int64_t concatted_whcn_dim = axis_mapping.at(3);
-const int64_t batch_axis = axis_mapping.at(concatted_whcn_dim);
+const int64_t concatted_whcn_dim = axis_map.at(3);
+const int64_t batch_axis = axis_map.at(concatted_whcn_dim);
// Multiply the extents of the batch axis by the batch size.
extents[batch_axis] *= padded_sizes.at(0);

switch (memory_layout) {
case utils::kWidthPacked:
-VK_CHECK_COND(extents[0] % 4 == 0);
-extents[0] /= 4;
+VK_CHECK_COND(extents[axis_map.at(0)] % 4 == 0);
+extents[axis_map.at(0)] /= 4;
break;
case utils::kHeightPacked:
-VK_CHECK_COND(extents[1] % 4 == 0);
-extents[1] /= 4;
+VK_CHECK_COND(extents[axis_map.at(1)] % 4 == 0);
+extents[axis_map.at(1)] /= 4;
break;
case utils::kChannelsPacked:
-VK_CHECK_COND(extents[2] % 4 == 0);
-extents[2] /= 4;
+VK_CHECK_COND(extents[axis_map.at(2)] % 4 == 0);
+extents[axis_map.at(2)] /= 4;
break;
}
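To make the extent calculation above concrete, the following self-contained walk-through mirrors the loop, the batch multiply, and the packed-dim division for one example tensor. It is a sketch, not library code, and assumes the default axis map {0, 1, 2, 2} with channels packing.

#include <array>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Padded NCHW sizes: N=2, C=8, H=4, W=6 (channels already a multiple of 4).
  const std::vector<int64_t> padded_sizes = {2, 8, 4, 6};
  const std::vector<int64_t> axis_map = {0, 1, 2, 2};  // assumed default
  std::array<uint32_t, 3> extents = {1, 1, 1};

  // W, H, C map to image axes through the first three axis_map entries.
  for (int whcn_dim = 0; whcn_dim < 3; ++whcn_dim) {
    const int64_t axis = axis_map.at(whcn_dim);
    const int64_t dim = padded_sizes.size() - 1 - whcn_dim;
    extents[axis] = static_cast<uint32_t>(padded_sizes.at(dim));
  }  // extents == {6, 4, 8}

  // Batches are concatenated along axis_map.at(axis_map.at(3)) == Z.
  extents[axis_map.at(axis_map.at(3))] *=
      static_cast<uint32_t>(padded_sizes.at(0));
  // extents == {6, 4, 16}

  // Channels-packed: four consecutive channel values share one texel.
  extents[axis_map.at(2)] /= 4;  // extents == {6, 4, 4}

  std::cout << extents[0] << " x " << extents[1] << " x " << extents[2] << "\n";
  return 0;
}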

@@ -229,25 +229,27 @@ vTensor::vTensor(
// Calculate tensor metadata
sizes_(sizes.begin(), sizes.end()),
dim_order_(calculate_dim_order(sizes_.size(), memory_layout_)),
-axis_mapping_(default_axis_mapping()),
+axis_map_(default_axis_map()),
strides_(calculate_strides(sizes, dim_order_)),
numel_(utils::multiply_integers(sizes_)),
padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)},
unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
padded_numel_(utils::multiply_integers(padded_sizes_)),
texture_limits_{{0, 0, 0}},
+logical_limits_{{0, 0, 0}},
// Utility Uniform Buffers that can be passed to shaders as arguments
sizes_uniform_(),
strides_uniform_(),
numel_uniform_(),
-axis_mapping_uniform_(),
+axis_map_uniform_(),
texture_limits_uniform_(),
+logical_limits_uniform_(),
// Construct Tensor storage
storage_(
context,
storage_type,
memory_layout_,
-axis_mapping_,
+axis_map_,
padded_sizes_,
dtype_,
allocate_memory) {
@@ -259,6 +261,8 @@ vTensor::vTensor(
utils::safe_downcast<int32_t>(storage_.image_extents_[0]),
utils::safe_downcast<int32_t>(storage_.image_extents_[1]),
utils::safe_downcast<int32_t>(storage_.image_extents_[2])};

+update_logical_limits();
}

if (dtype == vkapi::kHalf) {
@@ -275,7 +279,7 @@ vTensor::vTensor(const vTensor& other)
// Copy tensor size metadata
sizes_(other.sizes_.begin(), other.sizes_.end()),
dim_order_(other.dim_order_.begin(), other.dim_order_.end()),
-axis_mapping_(other.axis_mapping_.begin(), other.axis_mapping_.end()),
+axis_map_(other.axis_map_.begin(), other.axis_map_.end()),
strides_(other.strides_.begin(), other.strides_.end()),
numel_(other.numel_),
padded_sizes_{other.padded_sizes_.begin(), other.padded_sizes_.end()},
@@ -284,12 +288,14 @@ vTensor::vTensor(const vTensor& other)
other.unsqueezed_strides_.end()},
padded_numel_(other.padded_numel_),
texture_limits_{other.texture_limits_},
+logical_limits_{other.logical_limits_},
// Empty initialize Utility Uniform Buffers
sizes_uniform_(),
strides_uniform_(),
numel_uniform_(),
-axis_mapping_uniform_(),
+axis_map_uniform_(),
texture_limits_uniform_(),
+logical_limits_uniform_(),
// Copy Tensor storage
storage_(other.storage_) {}

@@ -303,19 +309,21 @@ vTensor::vTensor(
// Copy tensor size metadata
sizes_(sizes.begin(), sizes.end()),
dim_order_(dim_order.begin(), dim_order.end()),
-axis_mapping_(default_axis_mapping()),
+axis_map_(default_axis_map()),
strides_(calculate_strides(sizes_, dim_order_)),
numel_(utils::multiply_integers(sizes_)),
padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)},
unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
padded_numel_(utils::multiply_integers(padded_sizes_)),
-texture_limits_{{0, 0, 0}},
+texture_limits_{other.texture_limits_},
+logical_limits_(other.logical_limits_),
// Empty initialize Utility Uniform Buffers
sizes_uniform_(),
strides_uniform_(),
numel_uniform_(),
-axis_mapping_uniform_(),
+axis_map_uniform_(),
texture_limits_uniform_(),
+logical_limits_uniform_(),
// Copy Tensor storage
storage_(other.storage_, vkapi::element_size(dtype_) * offset_numel) {
VK_CHECK_COND(
@@ -356,12 +364,18 @@ vkapi::VulkanBuffer& vTensor::buffer(
return storage_.buffer_;
}

-utils::uvec3 vTensor::mapped_extents() const {
-utils::uvec3 m_extents;
-m_extents[0] = storage_.image_extents_[axis_mapping_.at(0)];
-m_extents[1] = storage_.image_extents_[axis_mapping_.at(1)];
-m_extents[2] = storage_.image_extents_[axis_mapping_.at(2)];
-return m_extents;
+void vTensor::update_logical_limits() {
+logical_limits_.limits[0] = texture_limits_.limits[axis_map_.at(0)];
+logical_limits_.limits[1] = texture_limits_.limits[axis_map_.at(1)];
+logical_limits_.limits[2] = texture_limits_.limits[axis_map_.at(2)];
}

+utils::uvec3 vTensor::logical_extents() const {
+utils::uvec3 logical_extents(
+{utils::safe_downcast<uint32_t>(logical_limits_.limits[0]),
+utils::safe_downcast<uint32_t>(logical_limits_.limits[1]),
+utils::safe_downcast<uint32_t>(logical_limits_.limits[2])});
+return logical_extents;
+}

const vkapi::BufferBindInfo vTensor::sizes_ubo() {
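For intuition, here is a sketch of the remapping that update_logical_limits() above performs. It uses a hypothetical permuted axis map {1, 0, 2, 2} (width mapped to image Y, height to image X) rather than the default, to show that the logical limits always come out in WHCN order.

#include <array>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Hypothetical permuted axis map: width -> Y, height -> X, channels -> Z.
  const std::vector<int64_t> axis_map = {1, 0, 2, 2};
  // Physical texture limits, in image (X, Y, Z) order.
  const std::array<int32_t, 3> texture_limits = {4, 6, 16};
  std::array<int32_t, 3> logical_limits{};

  // Entry i of logical_limits is the extent of the image axis that WHCN
  // dim i maps to, so shaders can reason in WHCN terms regardless of the
  // physical axis assignment.
  for (int i = 0; i < 3; ++i) {
    logical_limits[i] = texture_limits[axis_map.at(i)];
  }  // logical_limits == {6, 4, 16}: width, height, channels/batch extents

  std::cout << logical_limits[0] << ", " << logical_limits[1] << ", "
            << logical_limits[2] << "\n";
  return 0;
}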
@@ -380,12 +394,12 @@ const vkapi::BufferBindInfo vTensor::strides_ubo() {
return vkapi::BufferBindInfo(strides_uniform_.buffer());
}

-const vkapi::BufferBindInfo vTensor::axis_mapping_ubo() {
-if (!axis_mapping_uniform_.buffer()) {
-axis_mapping_uniform_ =
-ParamsBuffer(storage_.context_, utils::make_ivec4(axis_mapping_));
+const vkapi::BufferBindInfo vTensor::axis_map_ubo() {
+if (!axis_map_uniform_.buffer()) {
+axis_map_uniform_ =
+ParamsBuffer(storage_.context_, utils::make_ivec4(axis_map_));
}
-return vkapi::BufferBindInfo(axis_mapping_uniform_.buffer());
+return vkapi::BufferBindInfo(axis_map_uniform_.buffer());
}

const vkapi::BufferBindInfo vTensor::texture_limits_ubo() {
@@ -395,6 +409,13 @@ const vkapi::BufferBindInfo vTensor::texture_limits_ubo() {
return vkapi::BufferBindInfo(texture_limits_uniform_.buffer());
}

+const vkapi::BufferBindInfo vTensor::logical_limits_ubo() {
+if (!logical_limits_uniform_.buffer()) {
+logical_limits_uniform_ = ParamsBuffer(storage_.context_, logical_limits_);
+}
+return vkapi::BufferBindInfo(logical_limits_uniform_.buffer());
+}

const vkapi::BufferBindInfo vTensor::numel_ubo() {
if (!numel_uniform_.buffer()) {
numel_uniform_ = ParamsBuffer(storage_.context_, numel_);
@@ -465,14 +486,16 @@ void vTensor::update_metadata() {
// Calculate the extents of the image texture that would have been required
// for a tensor of the new sizes.
utils::uvec3 virtual_extents =
-calculate_image_extents(padded_sizes_, axis_mapping_, memory_layout_);
+calculate_image_extents(padded_sizes_, axis_map_, memory_layout_);

// Update the texture limits to reflect the new virtual extents.
texture_limits_.limits = utils::ivec3{
utils::safe_downcast<int32_t>(virtual_extents[0]),
utils::safe_downcast<int32_t>(virtual_extents[1]),
utils::safe_downcast<int32_t>(virtual_extents[2])};

+update_logical_limits();

if (sizes_uniform_.buffer()) {
sizes_uniform_.update(utils::make_whcn_ivec4(sizes_));
}
@@ -482,20 +505,23 @@ void vTensor::update_metadata() {
if (numel_uniform_.buffer()) {
numel_uniform_.update(numel_);
}
-if (axis_mapping_uniform_.buffer()) {
-axis_mapping_uniform_.update(utils::make_ivec4(axis_mapping_));
+if (axis_map_uniform_.buffer()) {
+axis_map_uniform_.update(utils::make_ivec4(axis_map_));
}
if (texture_limits_uniform_.buffer()) {
texture_limits_uniform_.update(texture_limits_);
}
+if (logical_limits_uniform_.buffer()) {
+logical_limits_uniform_.update(logical_limits_);
+}
}

void vTensor::check_sizes(const std::vector<int64_t>& sizes) const {
if (storage_type() != utils::kBuffer) {
// For texture storage check that the current texture is large enough for
// the new sizes of the tensor.
utils::uvec3 virtual_extents =
-calculate_image_extents(padded_sizes_, axis_mapping_, memory_layout_);
+calculate_image_extents(padded_sizes_, axis_map_, memory_layout_);

bool valid_resize = virtual_extents[0] <= image_extents()[0];
valid_resize = valid_resize && virtual_extents[1] <= image_extents()[1];
@@ -546,7 +572,7 @@ void vTensor::reallocate(const std::vector<int64_t>& new_sizes) {
update_metadata();
storage_.discard_and_reallocate(
calculate_padded_sizes(new_sizes, memory_layout_),
-axis_mapping_,
+axis_map_,
memory_layout_,
dtype_);
}
@@ -624,16 +650,14 @@ vTensorStorage::vTensorStorage(
Context* const context,
const utils::StorageType storage_type,
const utils::GPUMemoryLayout gpu_memory_layout,
-const std::vector<int64_t>& axis_mapping,
+const std::vector<int64_t>& axis_map,
const std::vector<int64_t>& padded_sizes,
const vkapi::ScalarType dtype,
const bool allocate_memory)
: context_(context),
storage_type_{storage_type},
-image_extents_(calculate_image_extents(
-padded_sizes,
-axis_mapping,
-gpu_memory_layout)),
+image_extents_(
+calculate_image_extents(padded_sizes, axis_map, gpu_memory_layout)),
buffer_length_{utils::multiply_integers(padded_sizes)},
buffer_offset_{0},
image_(allocate_image(
@@ -746,7 +770,7 @@ bool vTensorStorage::is_copy_of(const vTensorStorage& other) const {

void vTensorStorage::discard_and_reallocate(
const std::vector<int64_t>& padded_sizes,
-const std::vector<int64_t>& axis_mapping,
+const std::vector<int64_t>& axis_map,
const utils::GPUMemoryLayout gpu_memory_layout,
const vkapi::ScalarType dtype) {
const bool image_owns_memory = image_.owns_memory();
@@ -755,7 +779,7 @@ void vTensorStorage::discard_and_reallocate(
flush();

image_extents_ =
-calculate_image_extents(padded_sizes, axis_mapping, gpu_memory_layout);
+calculate_image_extents(padded_sizes, axis_map, gpu_memory_layout);
image_ = allocate_image(
context_,
image_extents_,