Skip to content

[ET-VK] Adding repeat support to add_copy_packed_dim_offset_node function. #9414

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Mar 27, 2025
116 changes: 110 additions & 6 deletions backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,17 @@ ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}

layout(push_constant) uniform restrict Block {
ivec4 range;

// if not repeating:
//   xyz is the source offset; w is the channel size
// if repeating:
//   xyzw are the source tensor sizes in WHCB dims respectively
ivec4 src_offset;

// if not repeating:
//   xyz is the destination offset; w is the channel size
// if repeating:
//   xyzw are the destination tensor sizes in WHCB dims respectively
ivec4 dst_offset;
};

Expand All @@ -37,13 +45,9 @@ const lowp int packed_dim = unhash_packed_dim(out_layout);
${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);

void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);

if (any(greaterThanEqual(pos, range.xyz))) {
return;
}
${layout_declare_spec_const(C, "bool", "repeat", "false")}

void no_repeat_copy(ivec3 pos) {
// Position in input tensor
ivec3 in_pos = pos + src_offset.xyz;
in_pos[packed_dim] = pos[packed_dim] + (src_offset[packed_dim] >> 2);
Expand Down Expand Up @@ -138,3 +142,103 @@ void main() {
out_value,
out_axis_map);
}

// Copy for the repeat case: each of the 4 lanes of the output texel at `pos`
// is filled by wrapping the corresponding output element index back into the
// source tensor's extents (WHCB), so the source tensor tiles across the
// destination.
//
// NOTE(review): in repeat mode the push constants are reinterpreted —
// src_offset.xyzw holds source sizes in WHCB order, while dst_offset.z/.w are
// used below as repetition counts (see out_channel_size); this differs from
// the "destination tensor sizes" wording in the push-constant comment.
// Verify against the host-side caller.
void repeat_copy(ivec3 pos) {
  // Expand the packed dim from texel units to element units so per-lane
  // source indices can be computed.
  pos[packed_dim] <<= 2;

  // Channel size, aligned up to 4 when tensors are channel packed; the raw
  // value otherwise. Used to step into the correct batch along z.
  const int channel_size = (packed_dim == C_DIM ? alignup4(src_offset.z) : src_offset.z);

  // Find the input texel's WHCB index. Width and height simply wrap around
  // the source extents.
  const int width_index = pos.x % src_offset.x;
  const int height_index = pos.y % src_offset.y;
  int channel_index;
  int batch_index;

  // Channel and batch are folded together along z, so unpacking them depends
  // on whether tensors are channel packed.
  if (packed_dim == C_DIM) {
    // The output channels in a batch: channel size * channel repetitions,
    // aligned up to 4 (channel-packed layout pads each batch's channels).
    const int out_channel_size = alignup4(src_offset.z * dst_offset.z);

    // Batch index in the output.
    const int out_pos_batch_index = pos.z / out_channel_size;

    // Source batch index based on the current output pos (wraps around the
    // source batch count).
    batch_index = out_pos_batch_index % src_offset.w;

    // Batch repetition count for the current output pos.
    const int batch_repetition_index = out_pos_batch_index / src_offset.w;

    // Calculate the input channel index based on the current output pos and
    // batch index. It's done this way because the source channel must restart
    // from zero whenever the batch index increments; batch_index itself resets
    // to zero after hitting the batch repetition count, so the current
    // repetition is tracked in batch_repetition_index and used to rebase pos.z
    // before the modulo.
    channel_index = (pos.z - (batch_index + batch_repetition_index * src_offset.w) * out_channel_size) % src_offset.z;
  } else {
    // The output channels in a batch: channel size * channel repetitions
    // (no alignment needed when not channel packed).
    const int out_channel_size = src_offset.z * dst_offset.z;

    // Source batch index based on the current output pos.
    batch_index = (pos.z / out_channel_size) % src_offset.w;

    // Source channel index is the current output pos wrapped by channel count.
    channel_index = pos.z % src_offset.z;
  }

  // Input texel's WHC position (batch is applied later via channel_size).
  const ivec3 in_pos = ivec3(width_index, height_index, channel_index);

  // Squeeze the packed dim back to texel units for the final write.
  pos[packed_dim] >>= 2;

  // Packed-dim index of the texel last fetched; -1 forces a load on the
  // first iteration.
  int fetched_in_pos_packed_dim = -1;

  // Fetched input texel (cached across lanes that hit the same texel).
  VEC4_T in_value;

  // Output texel value being assembled lane by lane.
  VEC4_T out_value = VEC4_T(0);

  // Element index along the packed dim of the next source lane to read.
  int src_lane_offset = in_pos[packed_dim];

  for (int i=0; i<4; i++) {
    // Only load a new input texel when the lane crosses a texel boundary.
    if ((src_lane_offset >> 2) != fetched_in_pos_packed_dim) {
      fetched_in_pos_packed_dim = (src_lane_offset >> 2);

      ivec3 curr_in_pos = in_pos;
      curr_in_pos[packed_dim] = src_lane_offset;
      // Step to the selected source batch along z.
      curr_in_pos.z = curr_in_pos.z + batch_index * channel_size;
      // Convert element index back to texel index in the packed dim.
      curr_in_pos[packed_dim] >>= 2;

      in_value = load_texel_lpos(t_in, curr_in_pos, in_axis_map);
    }

    // Select the lane within the cached texel.
    out_value[i] = in_value[src_lane_offset & 0x3];

    src_lane_offset++;
    // If the packed index exceeded the source packed-dim extent, wrap to zero
    // (branchless via integer mix).
    src_lane_offset = mix(src_lane_offset, 0, src_lane_offset >= src_offset[packed_dim]);
  }

  write_texel_lpos(
    t_out,
    pos,
    out_value,
    out_axis_map);
}

void main() {
  // Texel position handled by this invocation.
  const ivec3 tid = ivec3(gl_GlobalInvocationID);

  // Invocations outside the copy range have nothing to do.
  const bool in_range = all(lessThan(tid, range.xyz));
  if (!in_range) {
    return;
  }

  // Dispatch on the `repeat` specialization constant.
  if (!repeat) {
    no_repeat_copy(tid);
  } else {
    repeat_copy(tid);
  }
}
95 changes: 51 additions & 44 deletions backends/vulkan/runtime/graph/ops/impl/Copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,61 +71,68 @@ void add_copy_packed_dim_offset_node(
const ivec3& range,
const ivec4& src_offset,
const ivec4& dst_offset,
const ValueRef out) {
const ValueRef out,
bool repeat) {
vTensorPtr t_in = graph.get_tensor(in);
vTensorPtr t_out = graph.get_tensor(out);

// Check the packed dimension is same for both tensors, and if the packed
// dimension is Width or Height. Since the function does not support channel
// packing.
VK_CHECK_COND(
check_same_packed_dim(*t_in, *t_out) &&
(check_packed_dim_is(*t_in, WHCN::kWidthDim) ||
check_packed_dim_is(*t_in, WHCN::kHeightDim)));
// Check the packed dimension is same for both tensors
VK_CHECK_COND(check_same_packed_dim(*t_in, *t_out));
if (!repeat) {
// For non repeat copy also check if the packed dimension is Width or
// Height. Since the function does not support channel packing.
VK_CHECK_COND(
check_same_packed_dim(*t_in, *t_out) &&
(check_packed_dim_is(*t_in, WHCN::kWidthDim) ||
check_packed_dim_is(*t_in, WHCN::kHeightDim)));
}

std::string kernel_name = "copy_packed_dim_offset";
kernel_name.reserve(kShaderNameReserve);
add_dtype_suffix(kernel_name, *t_out);

const auto packed_dim = t_in->packed_dim();
// A copy of range with the last element set to batch size of the input tensor
ivec4 final_range = {
range[0], range[1], range[2], dim_at(t_in->sizes(), kBatch4D)};
ivec3 global_wg_size = t_out->logical_limits();
// The starting offset in a texel where this tensor will start copying from
const auto src_lane_offset = src_offset[packed_dim] & 0x3;
// The starting offset in a texel where this tensor will start copying to
const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;

// The total packed texels this tensor will be copied from
// The first texel of tensor data in packed dimension will be copied from
// remaining lanes from current source Hence (4 - src_lane_offset) is added
// to tensor size in packed dimension
const auto src_packed_size = utils::div_up_4(
(4 - src_lane_offset) +
dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim)));

// The total packed texels this tensor will be copied to
// The first texel of tensor data in packed dimension will be copied to
// remaining lanes from previous write Hence (4 - dst_lane_offset) is added to
// tensor size in packed dimension
const auto dst_packed_size = utils::div_up_4(
(4 - dst_lane_offset) +
dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));

// If the starting src offset is not 0, and the total packed texels is greater
// than the source texel range
const bool has_additional_src_work =
src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
// If the starting dst offset is not 0, and the total packed texels is greater
// than the source texel range
const bool has_additional_dst_work =
dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];

if (has_additional_src_work || has_additional_dst_work) {
global_wg_size[packed_dim]++; // Increase the global work group size in
// packed dimension
final_range[packed_dim]++; // Increase the range in packed dimension

if (!repeat) {
const auto packed_dim = t_in->packed_dim();
// The starting offset in a texel where this tensor will start copying from
const auto src_lane_offset = src_offset[packed_dim] & 0x3;
// The starting offset in a texel where this tensor will start copying to
const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;

// The total packed texels this tensor will be copied from
// The first texel of tensor data in packed dimension will be copied from
// remaining lanes from current source Hence (4 - src_lane_offset) is added
// to tensor size in packed dimension
const auto src_packed_size = utils::div_up_4(
(4 - src_lane_offset) +
dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim)));

// The total packed texels this tensor will be copied to
// The first texel of tensor data in packed dimension will be copied to
// remaining lanes from previous write Hence (4 - dst_lane_offset) is added
// to tensor size in packed dimension
const auto dst_packed_size = utils::div_up_4(
(4 - dst_lane_offset) +
dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));

// If the starting src offset is not 0, and the total packed texels is
// greater than the source texel range
const bool has_additional_src_work =
src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
// If the starting dst offset is not 0, and the total packed texels is
// greater than the source texel range
const bool has_additional_dst_work =
dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];

if (has_additional_src_work || has_additional_dst_work) {
global_wg_size[packed_dim]++; // Increase the global work group size in
// packed dimension
final_range[packed_dim]++; // Increase the range in packed dimension
}
}

auto shader = VK_KERNEL_FROM_STR(kernel_name);
Expand All @@ -144,7 +151,7 @@ void add_copy_packed_dim_offset_node(
// Parameter buffers
{},
// Specialization Constants
{graph.hashed_layout_of(out), graph.hashed_layout_of(in)},
{graph.hashed_layout_of(out), graph.hashed_layout_of(in), repeat},
nullptr,
{},
{
Expand Down
5 changes: 4 additions & 1 deletion backends/vulkan/runtime/graph/ops/impl/Copy.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,16 @@ void add_copy_offset_node(
// dst_offset (all are in texture coordinate (x, y, z) from the input image to
// the output image.
//
// The repeat flag indicates that the copy should wrap around the tensor
// dimensions; it is set to true only by the repeat op.
void add_copy_packed_dim_offset_node(
ComputeGraph& graph,
const ValueRef in,
const utils::ivec3& range,
const utils::ivec4& src_offset,
const utils::ivec4& dst_offset,
const ValueRef out);
const ValueRef out,
bool repeat = false);

// add_copy_channel_offset_node behaves similar to add_copy_node, except that it
// works on the channel dimensions of the tensor (up to 4 dimensions in NCHW).
Expand Down
Loading