Skip to content

[ET-VK] Adding repeat support to add_copy_packed_dim_offset_node function. #9414

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Mar 27, 2025
116 changes: 110 additions & 6 deletions backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,17 @@ ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}

layout(push_constant) uniform restrict Block {
ivec4 range;

// if not repeating:
//   xyz is the source offset; w is the channel size
// if repeating:
//   xyzw are the source tensor sizes in WHCB dims respectively
ivec4 src_offset;

// if not repeating:
//   xyz is the destination offset; w is the channel size
// if repeating:
//   xyzw are the destination tensor sizes in WHCB dims respectively
ivec4 dst_offset;
};

Expand All @@ -37,13 +45,9 @@ const lowp int packed_dim = unhash_packed_dim(out_layout);
${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);

void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);

if (any(greaterThanEqual(pos, range.xyz))) {
return;
}
${layout_declare_spec_const(C, "bool", "repeat", "false")}

void no_repeat_copy(ivec3 pos) {
// Position in input tensor
ivec3 in_pos = pos + src_offset.xyz;
in_pos[packed_dim] = pos[packed_dim] + (src_offset[packed_dim] >> 2);
Expand Down Expand Up @@ -138,3 +142,103 @@ void main() {
out_value,
out_axis_map);
}

// Copy for the repeat case: each of the 4 lanes of the output texel at `pos`
// is filled by wrapping the corresponding output element index back into the
// source tensor's extents (WHCB), so the source tensor tiles across the
// destination.
//
// NOTE(review): in repeat mode the push constants are reinterpreted —
// src_offset.xyzw holds source sizes in WHCB order, while dst_offset.z/.w are
// used below as repetition counts (see out_channel_size); this differs from
// the "destination tensor sizes" wording in the push-constant comment.
// Verify against the host-side caller.
void repeat_copy(ivec3 pos) {
  // Expand the packed dim from texel units to element units so per-lane
  // source indices can be computed.
  pos[packed_dim] <<= 2;

  // Channel size, aligned up to 4 when tensors are channel packed; the raw
  // value otherwise. Used to step into the correct batch along z.
  const int channel_size = (packed_dim == C_DIM ? alignup4(src_offset.z) : src_offset.z);

  // Find the input texel's WHCB index. Width and height simply wrap around
  // the source extents.
  const int width_index = pos.x % src_offset.x;
  const int height_index = pos.y % src_offset.y;
  int channel_index;
  int batch_index;

  // Channel and batch are folded together along z, so unpacking them depends
  // on whether tensors are channel packed.
  if (packed_dim == C_DIM) {
    // The output channels in a batch: channel size * channel repetitions,
    // aligned up to 4 (channel-packed layout pads each batch's channels).
    const int out_channel_size = alignup4(src_offset.z * dst_offset.z);

    // Batch index in the output.
    const int out_pos_batch_index = pos.z / out_channel_size;

    // Source batch index based on the current output pos (wraps around the
    // source batch count).
    batch_index = out_pos_batch_index % src_offset.w;

    // Batch repetition count for the current output pos.
    const int batch_repetition_index = out_pos_batch_index / src_offset.w;

    // Calculate the input channel index based on the current output pos and
    // batch index. It's done this way because the source channel must restart
    // from zero whenever the batch index increments; batch_index itself resets
    // to zero after hitting the batch repetition count, so the current
    // repetition is tracked in batch_repetition_index and used to rebase pos.z
    // before the modulo.
    channel_index = (pos.z - (batch_index + batch_repetition_index * src_offset.w) * out_channel_size) % src_offset.z;
  } else {
    // The output channels in a batch: channel size * channel repetitions
    // (no alignment needed when not channel packed).
    const int out_channel_size = src_offset.z * dst_offset.z;

    // Source batch index based on the current output pos.
    batch_index = (pos.z / out_channel_size) % src_offset.w;

    // Source channel index is the current output pos wrapped by channel count.
    channel_index = pos.z % src_offset.z;
  }

  // Input texel's WHC position (batch is applied later via channel_size).
  const ivec3 in_pos = ivec3(width_index, height_index, channel_index);

  // Squeeze the packed dim back to texel units for the final write.
  pos[packed_dim] >>= 2;

  // Packed-dim index of the texel last fetched; -1 forces a load on the
  // first iteration.
  int fetched_in_pos_packed_dim = -1;

  // Fetched input texel (cached across lanes that hit the same texel).
  VEC4_T in_value;

  // Output texel value being assembled lane by lane.
  VEC4_T out_value = VEC4_T(0);

  // Element index along the packed dim of the next source lane to read.
  int src_lane_offset = in_pos[packed_dim];

  for (int i=0; i<4; i++) {
    // Only load a new input texel when the lane crosses a texel boundary.
    if ((src_lane_offset >> 2) != fetched_in_pos_packed_dim) {
      fetched_in_pos_packed_dim = (src_lane_offset >> 2);

      ivec3 curr_in_pos = in_pos;
      curr_in_pos[packed_dim] = src_lane_offset;
      // Step to the selected source batch along z.
      curr_in_pos.z = curr_in_pos.z + batch_index * channel_size;
      // Convert element index back to texel index in the packed dim.
      curr_in_pos[packed_dim] >>= 2;

      in_value = load_texel_lpos(t_in, curr_in_pos, in_axis_map);
    }

    // Select the lane within the cached texel.
    out_value[i] = in_value[src_lane_offset & 0x3];

    src_lane_offset++;
    // If the packed index exceeded the source packed-dim extent, wrap to zero
    // (branchless via integer mix).
    src_lane_offset = mix(src_lane_offset, 0, src_lane_offset >= src_offset[packed_dim]);
  }

  write_texel_lpos(
    t_out,
    pos,
    out_value,
    out_axis_map);
}

void main() {
  // Texel position handled by this invocation.
  const ivec3 tid = ivec3(gl_GlobalInvocationID);

  // Invocations outside the copy range have nothing to do.
  const bool in_range = all(lessThan(tid, range.xyz));
  if (!in_range) {
    return;
  }

  // Dispatch on the `repeat` specialization constant.
  if (!repeat) {
    no_repeat_copy(tid);
  } else {
    repeat_copy(tid);
  }
}
95 changes: 51 additions & 44 deletions backends/vulkan/runtime/graph/ops/impl/Copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,61 +71,68 @@ void add_copy_packed_dim_offset_node(
const ivec3& range,
const ivec4& src_offset,
const ivec4& dst_offset,
const ValueRef out) {
const ValueRef out,
bool repeat) {
vTensorPtr t_in = graph.get_tensor(in);
vTensorPtr t_out = graph.get_tensor(out);

// Check the packed dimension is same for both tensors, and if the packed
// dimension is Width or Height. Since the function does not support channel
// packing.
VK_CHECK_COND(
check_same_packed_dim(*t_in, *t_out) &&
(check_packed_dim_is(*t_in, WHCN::kWidthDim) ||
check_packed_dim_is(*t_in, WHCN::kHeightDim)));
// Check the packed dimension is same for both tensors
VK_CHECK_COND(check_same_packed_dim(*t_in, *t_out));
if (!repeat) {
// For non repeat copy also check if the packed dimension is Width or
// Height. Since the function does not support channel packing.
VK_CHECK_COND(
check_same_packed_dim(*t_in, *t_out) &&
(check_packed_dim_is(*t_in, WHCN::kWidthDim) ||
check_packed_dim_is(*t_in, WHCN::kHeightDim)));
}

std::string kernel_name = "copy_packed_dim_offset";
kernel_name.reserve(kShaderNameReserve);
add_dtype_suffix(kernel_name, *t_out);

const auto packed_dim = t_in->packed_dim();
// A copy of range with the last element set to batch size of the input tensor
ivec4 final_range = {
range[0], range[1], range[2], dim_at(t_in->sizes(), kBatch4D)};
ivec3 global_wg_size = t_out->logical_limits();
// The starting offset in a texel where this tensor will start copying from
const auto src_lane_offset = src_offset[packed_dim] & 0x3;
// The starting offset in a texel where this tensor will start copying to
const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;

// The total packed texels this tensor will be copied from
// The first texel of tensor data in packed dimension will be copied from
// remaining lanes from current source Hence (4 - src_lane_offset) is added
// to tensor size in packed dimension
const auto src_packed_size = utils::div_up_4(
(4 - src_lane_offset) +
dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim)));

// The total packed texels this tensor will be copied to
// The first texel of tensor data in packed dimension will be copied to
// remaining lanes from previous write Hence (4 - dst_lane_offset) is added to
// tensor size in packed dimension
const auto dst_packed_size = utils::div_up_4(
(4 - dst_lane_offset) +
dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));

// If the starting src offset is not 0, and the total packed texels is greater
// than the source texel range
const bool has_additional_src_work =
src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
// If the starting dst offset is not 0, and the total packed texels is greater
// than the source texel range
const bool has_additional_dst_work =
dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];

if (has_additional_src_work || has_additional_dst_work) {
global_wg_size[packed_dim]++; // Increase the global work group size in
// packed dimension
final_range[packed_dim]++; // Increase the range in packed dimension

if (!repeat) {
const auto packed_dim = t_in->packed_dim();
// The starting offset in a texel where this tensor will start copying from
const auto src_lane_offset = src_offset[packed_dim] & 0x3;
// The starting offset in a texel where this tensor will start copying to
const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;

// The total packed texels this tensor will be copied from
// The first texel of tensor data in packed dimension will be copied from
// remaining lanes from current source Hence (4 - src_lane_offset) is added
// to tensor size in packed dimension
const auto src_packed_size = utils::div_up_4(
(4 - src_lane_offset) +
dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim)));

// The total packed texels this tensor will be copied to
// The first texel of tensor data in packed dimension will be copied to
// remaining lanes from previous write Hence (4 - dst_lane_offset) is added
// to tensor size in packed dimension
const auto dst_packed_size = utils::div_up_4(
(4 - dst_lane_offset) +
dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));

// If the starting src offset is not 0, and the total packed texels is
// greater than the source texel range
const bool has_additional_src_work =
src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
// If the starting dst offset is not 0, and the total packed texels is
// greater than the source texel range
const bool has_additional_dst_work =
dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];

if (has_additional_src_work || has_additional_dst_work) {
global_wg_size[packed_dim]++; // Increase the global work group size in
// packed dimension
final_range[packed_dim]++; // Increase the range in packed dimension
}
}

auto shader = VK_KERNEL_FROM_STR(kernel_name);
Expand All @@ -144,7 +151,7 @@ void add_copy_packed_dim_offset_node(
// Parameter buffers
{},
// Specialization Constants
{graph.hashed_layout_of(out), graph.hashed_layout_of(in)},
{graph.hashed_layout_of(out), graph.hashed_layout_of(in), repeat},
nullptr,
{},
{
Expand Down
5 changes: 4 additions & 1 deletion backends/vulkan/runtime/graph/ops/impl/Copy.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,16 @@ void add_copy_offset_node(
// dst_offset (all are in texture coordinate (x, y, z) from the input image to
// the output image.
//
// The repeat flag indicates that the copy should wrap around the tensor
// dimensions; it is set to true only by the repeat op.
void add_copy_packed_dim_offset_node(
ComputeGraph& graph,
const ValueRef in,
const utils::ivec3& range,
const utils::ivec4& src_offset,
const utils::ivec4& dst_offset,
const ValueRef out);
const ValueRef out,
bool repeat = false);

// add_copy_channel_offset_node behaves similar to add_copy_node, except that it
// works on the channel dimensions of the tensor (up to 4 dimensions in NCHW).
Expand Down
Loading