diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
index b1950f970e..9d1f6c3bd9 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
@@ -40,7 +40,12 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
  * size is only 1x1, making it easier to re-use loaded texels from t_kernel.
  */
 void main() {
-  const u16vec3 gpos = u16vec3(gl_GlobalInvocationID);
+  const uint16_t out_limits_y_scaled = uint16_t((out_limits.y + TILE_SIZE - 1) / TILE_SIZE);
+
+  const u16vec3 gpos = u16vec3(
+    gl_GlobalInvocationID.x / (out_limits_y_scaled * out_limits.z),
+    (gl_GlobalInvocationID.x / out_limits.z) % out_limits_y_scaled,
+    gl_GlobalInvocationID.x % out_limits.z);
 
   // Output position for TILE_SIZE = 2
   // +--------+--------+
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
index 1cdd7315f1..4f123cb833 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -370,11 +370,17 @@ void add_conv2d_node(
       weight_data,
       clamp_out);
 
+  utils::uvec3 wg_size = create_conv2d_global_wg_size(graph, method, out);
+
+  if (method == Conv2dMethod::Pointwise) {
+    wg_size = {wg_size[0] * wg_size[1] * wg_size[2], 1, 1};
+  }
+
   graph.execute_nodes().emplace_back(new DispatchNode(
       graph,
       shader,
-      create_conv2d_global_wg_size(graph, method, out),
-      graph.create_local_wg_size(out),
+      wg_size,
+      graph.create_local_wg_size(wg_size),
       // Inputs and Outputs
       {{out, vkapi::MemoryAccessType::WRITE},
        {{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},