[ET-VK] Adding all tensor packing support for repeat op. #9415

Merged
3 changes: 1 addition & 2 deletions backends/vulkan/op_registry.py
@@ -527,8 +527,6 @@ def register_view_op(features: OpFeatures):
exir_ops.edge.aten.flip.default,
exir_ops.edge.aten.index_select.default,
exir_ops.edge.aten.select_copy.int,
-# Tensor combination
-exir_ops.edge.aten.repeat.default,
# Tensor creation
exir_ops.edge.aten.arange.start_step,
exir_ops.edge.aten.clone.default,
@@ -561,6 +559,7 @@ def register_ported_op(features: OpFeatures):
exir_ops.edge.aten.permute_copy.default,
# Tensor combination
exir_ops.edge.aten.cat.default,
+exir_ops.edge.aten.repeat.default,
exir_ops.edge.aten.split_with_sizes_copy.default,
exir_ops.edge.aten.split.Tensor,
]
@@ -45,7 +45,7 @@ const lowp int packed_dim = unhash_packed_dim(out_layout);
${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);

-${layout_declare_spec_const(C, "bool", "repeat", "false")}
+${layout_declare_spec_const(C, "int", "repeat", "0")}

void no_repeat_copy(ivec3 pos) {
// Position in input tensor
@@ -229,7 +229,7 @@ void main() {
return;
}

-if (repeat) {
+if (repeat == 1) {
repeat_copy(pos);
} else {
no_repeat_copy(pos);
2 changes: 1 addition & 1 deletion backends/vulkan/runtime/graph/ops/impl/Copy.cpp
@@ -151,7 +151,7 @@ void add_copy_packed_dim_offset_node(
// Parameter buffers
{},
// Specialization Constants
-{graph.hashed_layout_of(out), graph.hashed_layout_of(in), repeat},
+{graph.hashed_layout_of(out), graph.hashed_layout_of(in), repeat ? 1 : 0},
nullptr,
{},
{
147 changes: 14 additions & 133 deletions backends/vulkan/runtime/graph/ops/impl/Repeat.cpp
@@ -23,8 +23,7 @@ void check_args(
const api::vTensor& in,
const std::vector<int64_t>& repeats,
const api::vTensor& out) {
-VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim));
-VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim));
+VK_CHECK_COND(check_same_packed_dim(in, out));

VK_CHECK_COND(in.storage_type() == out.storage_type());
if (in.storage_type() == utils::kTexture2D) {
@@ -59,147 +58,29 @@

} // namespace

-void add_repeat_channel_node(
-ComputeGraph& graph,
-ValueRef in,
-int64_t repeat_channel,
-ValueRef out,
-utils::ivec3& running_range) {
-vTensorPtr t_in = graph.get_tensor(in);
-vTensorPtr t_out = graph.get_tensor(out);
-
-std::string kernel_name = "repeat_channel";
-kernel_name.reserve(kShaderNameReserve);
-add_dtype_suffix(kernel_name, *t_out);
-
-const std::vector<int64_t>& in_sizes = t_in->sizes();
-
-int32_t in_width = utils::safe_downcast<int32_t>(dim_at<kWidth4D>(in_sizes));
-int32_t in_height =
-utils::safe_downcast<int32_t>(dim_at<kHeight4D>(in_sizes));
-int32_t in_channel =
-utils::safe_downcast<int32_t>(dim_at<kChannel4D>(in_sizes));
-int32_t in_batch = utils::safe_downcast<int32_t>(dim_at<kBatch4D>(in_sizes));
-
-int32_t out_channel = repeat_channel * in_channel;
-
-utils::ivec4 out_whcn_sizes{in_width, in_height, out_channel, in_batch};
-
-utils::ivec4 in_whcn_sizes{in_width, in_height, in_channel, in_batch};
-
-// Channel packed global work ids
-running_range[2] = out_whcn_sizes[3] * utils::div_up_4(out_whcn_sizes[2]);
-utils::uvec3 global_size = utils::make_uvec3(running_range);
-utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
-const struct Block final {
-utils::ivec4 out_sizes;
-utils::ivec4 in_size;
-} repeat_channel_args{
-out_whcn_sizes,
-in_whcn_sizes,
-};
-
-auto shader = VK_KERNEL_FROM_STR(kernel_name);
-
-graph.execute_nodes().emplace_back(new DispatchNode(
-graph,
-VK_KERNEL_FROM_STR(kernel_name),
-global_size,
-local_size,
-// Inputs and Outputs
-{{out, vkapi::MemoryAccessType::WRITE},
-{in, vkapi::MemoryAccessType::READ}},
-// Parameter buffers
-{graph.create_params_buffer(repeat_channel_args)},
-// Specialization Constants
-{SV(t_out->packed_dim())}));
-}
-
void add_repeat_node(
ComputeGraph& graph,
ValueRef in,
ValueRef repeats_ref,
ValueRef out) {
-std::vector<int64_t> repeats = *(graph.get_int_list(repeats_ref));
+const std::vector<int64_t> repeats = *(graph.get_int_list(repeats_ref));

vTensorPtr t_in = graph.get_tensor(in);
vTensorPtr t_out = graph.get_tensor(out);
check_args(*t_in, repeats, *t_out);

-// In this function, we expand the dimensions in the following order:
-// 1. Channel
-// 2. Width
-// 3. Height
-// 4. Batch
-// After expanding a dimension, we will update the "running_range" since we
-// will need to copy the "expanded" area.
-
-utils::ivec3 running_range = t_in->logical_limits();
-
-const std::vector<int64_t>& in_sizes = t_in->sizes();
-
-// Since we use channel packing, repeating the channel dimension is the most
-// complicated and time-consuming, as we need to reason over misaligned
-// channels. Hence we expand it first to minimize cost. Also, in this first
-// dimension, we copy over the input texure to the output. In subsequent
-// dimensions, we read and write from the same tensor.
-
-if (int64_t channel_repeat = dim_at<kChannel4D>(repeats);
-channel_repeat == 1) {
-// If no repeat, short-cut to a direct copy
-utils::ivec4 src_offset{0, 0, 0, 0};
-utils::ivec4 dst_offset{0, 0, 0, 0};
-
-add_copy_offset_node(
-graph, in, running_range, src_offset, dst_offset, out, false, false);
-
-} else {
-add_repeat_channel_node(graph, in, channel_repeat, out, running_range);
-}
-
-// TODO: refactor width, height, and batch into a common helper function.
-// Width
-if (int64_t width_repeat = dim_at<kWidth4D>(repeats); width_repeat > 1) {
-utils::ivec4 src_offset{0, 0, 0, 0};
-
-for (int i = 1; i < width_repeat; ++i) {
-utils::ivec4 dst_offset{i * dim_at<kWidth4D>(in_sizes), 0, 0, 0};
-
-add_copy_offset_node(
-graph, out, running_range, src_offset, dst_offset, out, true, false);
-}
-
-running_range[0] = running_range[0] * width_repeat;
-}
-
-// Height
-if (int64_t height_repeat = dim_at<kHeight4D>(repeats); height_repeat > 1) {
-utils::ivec4 src_offset{0, 0, 0, 0};
-
-for (int i = 1; i < height_repeat; ++i) {
-utils::ivec4 dst_offset = {0, i * dim_at<kHeight4D>(in_sizes), 0, 0};
-
-add_copy_offset_node(
-graph, out, running_range, src_offset, dst_offset, out, true, false);
-}
-
-running_range[1] = running_range[1] * height_repeat;
-}
-
-// Batch
-if (int64_t batch_repeat = dim_at<kBatch4D>(repeats); batch_repeat > 1) {
-utils::ivec4 src_offset{0, 0, 0, 0};
-
-for (int i = 1; i < batch_repeat; ++i) {
-utils::ivec4 dst_offset = {0, 0, i * running_range[2], 0};
-
-add_copy_offset_node(
-graph, out, running_range, src_offset, dst_offset, out, true, false);
-}
-
-running_range[2] = running_range[2] * batch_repeat;
-}
+const utils::ivec4 src_offset{
+dim_at<kWidth4D>(t_in->sizes()),
+dim_at<kHeight4D>(t_in->sizes()),
+dim_at<kChannel4D>(t_in->sizes()),
+dim_at<kBatch4D>(t_in->sizes())};
+const utils::ivec4 dst_offset{
+dim_at<kWidth4D>(repeats),
+dim_at<kHeight4D>(repeats),
+dim_at<kChannel4D>(repeats),
+dim_at<kBatch4D>(repeats)};
+add_copy_packed_dim_offset_node(
+graph, in, t_out->logical_limits(), src_offset, dst_offset, out, true);
}

void repeat(ComputeGraph& graph, const std::vector<ValueRef>& args) {
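For reference, the rewritten add_repeat_node issues a single add_copy_packed_dim_offset_node dispatch with the repeat flag enabled, passing the input's WHCN sizes as src_offset and the per-dimension repeat counts as dst_offset, instead of expanding channel, width, height, and batch in separate passes. For equal-rank repeats, the semantics that copy has to reproduce are simply that every output element reads the input element at the output index modulo the input size in each dimension. A minimal NumPy sketch of that reference behaviour (illustrative only; repeat_reference is a hypothetical helper, not code from this PR):

import numpy as np

def repeat_reference(x, repeats):
    # Reference semantics of repeat when len(repeats) == x.ndim: the output
    # shape is in_size[d] * repeats[d], and out[idx] reads in[idx % in_size].
    out_shape = tuple(s * r for s, r in zip(x.shape, repeats))
    out = np.empty(out_shape, dtype=x.dtype)
    for idx in np.ndindex(out_shape):
        out[idx] = x[tuple(i % s for i, s in zip(idx, x.shape))]
    return out

# Sanity check against NumPy's tiling, which matches aten.repeat semantics
# when the number of repeats equals the tensor rank.
x = np.arange(6).reshape(1, 1, 2, 3)
assert np.array_equal(repeat_reference(x, (1, 3, 2, 2)), np.tile(x, (1, 3, 2, 2)))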
12 changes: 10 additions & 2 deletions backends/vulkan/test/op_tests/cases.py
@@ -754,7 +754,11 @@ def get_repeat_inputs():
((2, 3), [3, 1, 4]),
]
)
-test_suite_2d.layouts = ["utils::kChannelsPacked"]
+test_suite_2d.layouts = [
+"utils::kWidthPacked",
+"utils::kHeightPacked",
+"utils::kChannelsPacked",
+]
test_suite_2d.storage_types = ["utils::kTexture2D"]
test_suite_2d.data_gen = "make_seq_tensor"
test_suite_2d.dtypes = ["at::kFloat"]
@@ -795,7 +799,11 @@ def get_repeat_inputs():
((2, 3), [3, 3, 2, 4]),
]
)
-test_suite_3d.layouts = ["utils::kChannelsPacked"]
+test_suite_3d.layouts = [
+"utils::kWidthPacked",
+"utils::kHeightPacked",
+"utils::kChannelsPacked",
+]
test_suite_3d.storage_types = ["utils::kTexture3D"]
test_suite_3d.data_gen = "make_seq_tensor"
test_suite_3d.dtypes = ["at::kFloat"]
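For the shape bookkeeping behind these cases: when the repeat list is longer than the tensor's rank, leading dimensions of size one are implicitly prepended before each dimension is multiplied, so the 2-D input in ((2, 3), [3, 3, 2, 4]) produces a 4-D output. A quick eager-mode illustration (assumes a local PyTorch install; not part of the test suite):

import torch

x = torch.arange(6, dtype=torch.float32).reshape(2, 3)

# Repeats matching the rank multiply the existing dimensions ...
print(x.repeat(3, 1).shape)        # torch.Size([6, 3])
# ... while longer repeat lists prepend new leading dimensions first.
print(x.repeat(3, 3, 2, 4).shape)  # torch.Size([3, 3, 4, 12])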