
Commit 7f9b659

* Apple/Win32 compile errors fixed
* Subgroup size used to determine tile size -> fixes llvmpipe errors.
1 parent 0715985 commit 7f9b659

5 files changed: +120 −77 lines changed

ggml/src/ggml-backend.cpp

Lines changed: 5 additions & 5 deletions
```diff
@@ -1883,12 +1883,12 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
 }

 bool ggml_backend_compare_graph_backend_node(
-    ggml_backend_t backend1,
-    ggml_backend_t backend2,
-    struct ggml_cgraph * graph1,
-    struct ggml_cgraph * graph2,
+    ggml_backend_t backend1,
+    ggml_backend_t backend2,
+    struct ggml_cgraph * graph1,
+    struct ggml_cgraph * graph2,
     ggml_backend_eval_callback callback, void * user_data, char* op_name_out_1, char* op_name_out_2) {
-
+
     ggml_tensor * out1 = NULL;
     ggml_tensor * out2 = NULL;
```

(The removed and added lines here are textually identical, so this hunk is evidently a whitespace-only cleanup; the same applies to several hunks below.)
ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 36 additions & 22 deletions
```diff
@@ -880,7 +880,7 @@ struct vk_op_conv2d_push_constants {
     uint32_t Cout;
     uint32_t Cin;
     uint32_t N;
-
+
     uint32_t KW;
     uint32_t KH;
     uint32_t W;
@@ -1041,7 +1041,7 @@ class vk_perf_logger {
         }

         timings.clear();
-        flops.clear();
+        flops.clear();
     }

     void log_timing(const ggml_tensor * node, uint64_t time) {
@@ -1082,7 +1082,7 @@ class vk_perf_logger {
             flops[name].push_back(n_flops);
             timings[name].push_back(time);
             return;
-        }
+        }
         timings[ggml_op_name(node->op)].push_back(time);
     }
 private:
@@ -2190,6 +2190,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
             }
             compile_count++;
         }
+
         compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), spv_size, spv_data, entrypoint,
             parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size));
     };
@@ -3037,14 +3038,27 @@ static void ggml_vk_load_shaders(vk_device& device) {
     uint32_t conv2d_WG_SIZE = 256;
     uint32_t conv2d_BS_K = 128;
     uint32_t conv2d_BS_CRS = 16;
+    // Enables subgroup ops for preventing the re-calculation of indices.
+    uint32_t use_collectives = 0;
+    // CRS block size should be capped at subgroup size for correctness when shuffle is used.
+    if (device->subgroup_shuffle) {
+        use_collectives = 1;
+        conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS);
+    }
     uint32_t conv2d_BS_NPQ = 128;
     uint32_t conv2d_TS_K = 8;
     uint32_t conv2d_shmem_req = (conv2d_BS_K*(conv2d_BS_CRS+1) + conv2d_BS_CRS*(conv2d_BS_NPQ+1))*sizeof(float);
     if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
         conv2d_BS_CRS = 8;
-        conv2d_TS_K = 8;
-    }
-    ggml_vk_create_pipeline(device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, sizeof(vk_op_conv2d_push_constants), {conv2d_BS_K, conv2d_BS_NPQ, 1}, {conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K}, 1);
+        if (device->subgroup_shuffle) {
+            conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS);
+        }
+    }
+    if (device->subgroup_shuffle) {
+        ggml_vk_create_pipeline(device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, sizeof(vk_op_conv2d_push_constants), {conv2d_BS_K, conv2d_BS_NPQ, 1}, {conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives}, 1, true, true);
+    } else {
+        ggml_vk_create_pipeline(device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, sizeof(vk_op_conv2d_push_constants), {conv2d_BS_K, conv2d_BS_NPQ, 1}, {conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives}, 1, true);
+    }

     ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f32, "conv2d_dw_cwhn_f32", conv2d_dw_cwhn_f32_len, conv2d_dw_cwhn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
```
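The hunk above is the core of the llvmpipe fix: `subgroupShuffle` can only exchange data between lanes of a single subgroup, so a CRS block of 16 is only valid on devices whose subgroup has at least 16 lanes. llvmpipe commonly reports a small subgroup size (8 on typical builds), so the previous fixed `BS_CRS = 16` ended up addressing lanes that do not exist. Below is a minimal standalone sketch of the same selection logic; the device limits (`subgroup_size`, `max_shmem`) are hypothetical stand-ins for the values that `ggml_vk_load_shaders` reads from `vk_device`:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t subgroup_size    = 8;          // e.g. llvmpipe; 32 on NVIDIA, 64 on AMD wave64
    const uint32_t max_shmem        = 48 * 1024;  // stand-in for maxComputeSharedMemorySize
    const bool     subgroup_shuffle = true;       // device supports GL_KHR_shader_subgroup_shuffle

    uint32_t BS_K = 128, BS_CRS = 16, BS_NPQ = 128;
    uint32_t use_collectives = 0;

    // Shuffles only work within one subgroup, so the CRS block must not
    // span more lanes than the subgroup has.
    if (subgroup_shuffle) {
        use_collectives = 1;
        BS_CRS = std::min(subgroup_size, BS_CRS);
    }

    // Shared memory holds one padded tile of A (BS_K x BS_CRS) and one of B
    // (BS_CRS x BS_NPQ); shrink the CRS block if the device cannot fit them.
    uint32_t shmem_req = (BS_K * (BS_CRS + 1) + BS_CRS * (BS_NPQ + 1)) * sizeof(float);
    if (max_shmem < shmem_req) {
        BS_CRS = 8;
        if (subgroup_shuffle) {
            BS_CRS = std::min(subgroup_size, BS_CRS);
        }
    }

    printf("BS_CRS=%u use_collectives=%u\n", BS_CRS, use_collectives);
}
```

Judging by the parameter list of `ggml_vk_create_pipeline_func` in the earlier hunk (`..., disable_robustness, require_full_subgroups, required_subgroup_size`), the extra trailing `true` in the shuffle path plausibly maps to `require_full_subgroups`, which guarantees that every lane the shuffle consults actually exists.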
```diff
@@ -6895,11 +6909,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         }
         return nullptr;
     case GGML_OP_CONV_2D:
-        if (src0->type == GGML_TYPE_F32 &&
-            src1->type == GGML_TYPE_F32 &&
-            dst->type == GGML_TYPE_F32 &&
-            ggml_is_contiguous(src0) &&
-            ggml_is_contiguous(src1) &&
+        if (src0->type == GGML_TYPE_F32 &&
+            src1->type == GGML_TYPE_F32 &&
+            dst->type == GGML_TYPE_F32 &&
+            ggml_is_contiguous(src0) &&
+            ggml_is_contiguous(src1) &&
             ggml_is_contiguous(dst)) {
             return ctx->device->pipeline_conv2d_f32;
         }
@@ -7231,7 +7245,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
         // src0 - kernel: [KW, KH, Cin, Cout]
         // src1 - input:  [W, H, Cin, N]
         // dst  - result: [OW, OH, Cout, N]
-
+
         // Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
         auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
             return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
@@ -7246,9 +7260,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
         int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
         int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
         int64_t NPQ = N*OW*OH;
-
+
         // Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups
-        elements = {static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1};
+        elements = {static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1};
     } break;
     case GGML_OP_ADD:
     case GGML_OP_SUB:
```
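The `calc_conv_output_size` lambda is the standard convolution output-extent formula, mirroring `ggml_calc_conv_output_size` from ggml.c. A quick worked example (the sizes here are hypothetical) shows how `OW`/`OH`, and from them the `NPQ = N*OW*OH` dimension of the implicit-GEMM output matrix, are obtained:

```cpp
#include <cstdint>
#include <cstdio>

// Same formula as the lambda above: output extent of a convolution given
// input size, kernel size, stride, padding, and dilation.
static int64_t calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
    return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
}

int main() {
    // 224x224 input, 3x3 kernel, stride 1, padding 1, dilation 1:
    // (224 + 2 - 2 - 1)/1 + 1 = 224, i.e. a "same"-size output.
    printf("%lld\n", (long long) calc_conv_output_size(224, 3, 1, 1, 1)); // 224

    // Stride 2 halves the spatial extent: (224 + 2 - 2 - 1)/2 + 1 = 112.
    printf("%lld\n", (long long) calc_conv_output_size(224, 3, 2, 1, 1)); // 112
}
```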
```diff
@@ -8131,14 +8145,14 @@ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c
     p.Cout = static_cast<uint32_t>(ne03);
     p.Cin = static_cast<uint32_t>(ne02);
     p.N = static_cast<uint32_t>(ne13);
-
+
     p.KW = static_cast<uint32_t>(ne00);
     p.KH = static_cast<uint32_t>(ne01);
     p.W = static_cast<uint32_t>(ne10);
     p.H = static_cast<uint32_t>(ne11);
     p.OW = static_cast<uint32_t>(ne0);
     p.OH = static_cast<uint32_t>(ne1);
-
+
     p.s0 = static_cast<uint32_t>(dst->op_params[0]);
     p.s1 = static_cast<uint32_t>(dst->op_params[1]);
     p.p0 = static_cast<uint32_t>(dst->op_params[2]);
@@ -8162,7 +8176,7 @@ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c
     GGML_ASSERT(ne02 == ne12);

     ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_2D, std::move(p), dryrun);
-
+
 }

 static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -10805,11 +10819,11 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
             return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
         case GGML_OP_CONV_2D:
             // Channel-contiguous format is not supported yet.
-            return (op->src[0]->type == GGML_TYPE_F32 &&
-                    op->src[1]->type == GGML_TYPE_F32 &&
-                    op->type == GGML_TYPE_F32 &&
-                    ggml_is_contiguous(op->src[0]) &&
-                    ggml_is_contiguous(op->src[1]) &&
+            return (op->src[0]->type == GGML_TYPE_F32 &&
+                    op->src[1]->type == GGML_TYPE_F32 &&
+                    op->type == GGML_TYPE_F32 &&
+                    ggml_is_contiguous(op->src[0]) &&
+                    ggml_is_contiguous(op->src[1]) &&
                     ggml_is_contiguous(op));
         default:
             return false;
```

ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp

Lines changed: 60 additions & 32 deletions
```diff
@@ -1,7 +1,5 @@
 #version 450

-#define USE_COLLECTIVES
-
 #ifdef USE_COLLECTIVES
 #extension GL_KHR_shader_subgroup_shuffle: enable
 #endif
@@ -12,7 +10,7 @@
 #define SHMEM_PAD 0

 // shape notation: [dim(N), ..., dim(0)] -- stride(dim(j)) >= stride(dim(i)) if i > j
-layout (binding = 0) readonly buffer A {A_TYPE knl_data[];}; // src0 - kernel: [KW, KH, Cin, Cout]
+layout (binding = 0) readonly buffer A {A_TYPE knl_data[];}; // src0 - kernel: [KW, KH, Cin, Cout]
 layout (binding = 1) readonly buffer B {B_TYPE src_data[];}; // src1 - input: [W, H, Cin, N] -- channel_first format
 layout (binding = 2) writeonly buffer D {D_TYPE dst_data[];}; // dst - result: [OW, OH, Cout, N]
@@ -21,7 +19,7 @@ layout (push_constant) uniform parameter {
     uint32_t Cout;
     uint32_t Cin;
     uint32_t N;
-
+
     // Tensor spatial sizes: kernel, input, output
     uint32_t KW;
     uint32_t KH;
@@ -59,6 +57,7 @@ layout(constant_id = 2) const uint BS_CRS = 16;
 layout(constant_id = 3) const uint BS_NPQ = 128;
 // Thread-tile sizes
 layout(constant_id = 4) const uint TS_K = 8;
+layout(constant_id = 5) const uint use_collectives = 1;

 uint32_t tid = gl_LocalInvocationID.x;
 const uint32_t WG_SIZE = gl_WorkGroupSize.x;
@@ -122,31 +121,48 @@ uint32_t Br = tid / BS_NPQ;
 uint32_t Bc = tid % BS_NPQ;
 const uint32_t BrpWg = WG_SIZE / BS_NPQ;

-void main(){\
+void main(){
     for(uint32_t T_ly = 0; T_ly < TS_K; T_ly++){
         for(uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++){
             regC[T_ly][T_lx] = 0.0;
         }
     }
-    /* Advance block in CRS dim */\
+    /* Advance block in CRS dim */
     for(uint32_t B_idx_CRS = 0; B_idx_CRS < NB_CRS; B_idx_CRS++){
+        uint32_t CRS_idx_a;
+        uint32_t Cin_idx_a;
+        uint32_t KH_idx_a;
+        uint32_t KW_idx_a;
+
 #ifdef USE_COLLECTIVES
-        uint32_t cached_CRS_idx = B_idx_CRS*BS_CRS + gl_SubgroupInvocationID;
-        uint32_t cached_Cin_idx = cached_CRS_idx / (p.KW*p.KH);
-        uint32_t cached_CRS_remainder = (cached_CRS_idx - cached_Cin_idx*p.KW*p.KH);
-        uint32_t cached_KH_idx = cached_CRS_remainder / p.KW;
-        uint32_t cached_KW_idx = cached_CRS_remainder - cached_KH_idx*p.KW;
-
-        uint32_t CRS_idx_a = subgroupShuffle(cached_CRS_idx, Ac);
-        uint32_t Cin_idx_a = subgroupShuffle(cached_Cin_idx, Ac);
-        uint32_t KH_idx_a = subgroupShuffle(cached_KH_idx, Ac);
-        uint32_t KW_idx_a = subgroupShuffle(cached_KW_idx, Ac);
+        uint32_t cached_CRS_idx;
+        uint32_t cached_Cin_idx;
+        uint32_t cached_KH_idx;
+        uint32_t cached_KW_idx;
+        if(use_collectives == 1){
+            cached_CRS_idx = B_idx_CRS*BS_CRS + gl_SubgroupInvocationID;
+            cached_Cin_idx = cached_CRS_idx / (p.KW*p.KH);
+            uint32_t cached_CRS_remainder = (cached_CRS_idx - cached_Cin_idx*p.KW*p.KH);
+            cached_KH_idx = cached_CRS_remainder / p.KW;
+            cached_KW_idx = cached_CRS_remainder - cached_KH_idx*p.KW;
+
+            CRS_idx_a = subgroupShuffle(cached_CRS_idx, Ac);
+            Cin_idx_a = subgroupShuffle(cached_Cin_idx, Ac);
+            KH_idx_a = subgroupShuffle(cached_KH_idx, Ac);
+            KW_idx_a = subgroupShuffle(cached_KW_idx, Ac);
+        }else{
+            CRS_idx_a = B_idx_CRS*BS_CRS + Ac; // Global CRS_idx_a (column index of A)
+            Cin_idx_a = CRS_idx_a / (p.KW*p.KH);
+            uint32_t CRS_remainder = CRS_idx_a - Cin_idx_a*p.KW*p.KH;
+            KH_idx_a = CRS_remainder / p.KW;
+            KW_idx_a = CRS_remainder - KH_idx_a*p.KW;
+        }
 #else
-        uint32_t CRS_idx_a = B_idx_CRS*BS_CRS + Ac; // Global CRS_idx_a (column index of A)
-        uint32_t Cin_idx_a = CRS_idx_a / (p.KW*p.KH);
-        uint32_t CRS_remainder = CRS_idx_a - Cin_idx_a*p.KW*p.KH;
-        uint32_t KH_idx_a = CRS_remainder / p.KW;
-        uint32_t KW_idx_a = CRS_remainder - KH_idx_a*p.KW;
+        CRS_idx_a = B_idx_CRS*BS_CRS + Ac; // Global CRS_idx_a (column index of A)
+        Cin_idx_a = CRS_idx_a / (p.KW*p.KH);
+        uint32_t CRS_remainder = CRS_idx_a - Cin_idx_a*p.KW*p.KH;
+        KH_idx_a = CRS_remainder / p.KW;
+        KW_idx_a = CRS_remainder - KH_idx_a*p.KW;
 #endif

         /* Load kernel to A_block: (BS_K x BS_CRS)*/
@@ -170,20 +186,32 @@ void main(){
             uint32_t NPQ_remainder = NPQ_idx - N_idx*p.OH*p.OW;
             uint32_t OH_idx = NPQ_remainder / p.OW;
             uint32_t OW_idx = NPQ_remainder - OH_idx*p.OW;
-
+
+            uint32_t CRS_idx_b;
+            uint32_t Cin_idx_b;
+            uint32_t KH_idx_b;
+            uint32_t KW_idx_b;
 #ifdef USE_COLLECTIVES
-            uint32_t CRS_idx_b = subgroupShuffle(cached_CRS_idx, r_offset + Br);
-            uint32_t Cin_idx_b = subgroupShuffle(cached_Cin_idx, r_offset + Br);
-            uint32_t KH_idx_b = subgroupShuffle(cached_KH_idx, r_offset + Br);
-            uint32_t KW_idx_b = subgroupShuffle(cached_KW_idx, r_offset + Br);
+            if(use_collectives == 1){
+                CRS_idx_b = subgroupShuffle(cached_CRS_idx, r_offset + Br);
+                Cin_idx_b = subgroupShuffle(cached_Cin_idx, r_offset + Br);
+                KH_idx_b = subgroupShuffle(cached_KH_idx, r_offset + Br);
+                KW_idx_b = subgroupShuffle(cached_KW_idx, r_offset + Br);
+            }else{
+                CRS_idx_b = B_idx_CRS*BS_CRS + B_ly; /* Global CRS index (row index of B) */
+                Cin_idx_b = CRS_idx_b / (p.KW*p.KH);
+                uint32_t CRS_remainder = CRS_idx_b - Cin_idx_b*p.KW*p.KH;
+                KH_idx_b = CRS_remainder / p.KW;
+                KW_idx_b = CRS_remainder - KH_idx_b*p.KW;
+            }
 #else
-            uint32_t CRS_idx_b = B_idx_CRS*BS_CRS + B_ly; /* Global CRS index (row index of B) */
-            uint32_t Cin_idx_b = CRS_idx_b / (p.KW*p.KH);
+            CRS_idx_b = B_idx_CRS*BS_CRS + B_ly; /* Global CRS index (row index of B) */
+            Cin_idx_b = CRS_idx_b / (p.KW*p.KH);
             uint32_t CRS_remainder = CRS_idx_b - Cin_idx_b*p.KW*p.KH;
-            uint32_t KH_idx_b = CRS_remainder / p.KW;
-            uint32_t KW_idx_b = CRS_remainder - KH_idx_b*p.KW;
+            KH_idx_b = CRS_remainder / p.KW;
+            KW_idx_b = CRS_remainder - KH_idx_b*p.KW;
 #endif
-
+
             uint32_t H_idx = OH_idx*p.s1 + KH_idx_b*p.d1 - p.p1;
             uint32_t W_idx = OW_idx*p.s0 + KW_idx_b*p.d0 - p.p0;
             uint32_t src_idx = min(max(W_idx + H_idx*p.nb11 + Cin_idx_b*p.nb12 + N_idx*p.nb13, 0), p.Cin*p.N*p.W*p.H-1);
@@ -223,4 +251,4 @@ void main(){
             }
         }
     }
-}
+}
```
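The shader treats the convolution as an implicit GEMM: the kernel's `(Cin, KH, KW)` axes are flattened into a single CRS index, and each thread that loads a tile element must split that index back apart with a chain of integer divisions. With collectives enabled, every subgroup lane performs the division chain for exactly one CRS offset, and the other lanes pick up the precomputed results via `subgroupShuffle` instead of redoing the divisions. This is also why `BS_CRS` must be capped at the subgroup size: the shuffle's source lane ranges over `0..BS_CRS-1`, and any lane index beyond the subgroup yields undefined results, which is what broke on llvmpipe. A scalar C++ model of that equivalence follows; sizes and names here are illustrative, not taken from the shader:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Scalar model of the index math in conv2d_mm.comp: a flattened CRS index
// (CRS = Cin*KH*KW, KW innermost) is split back into (Cin, KH, KW).
struct KnlIdx { uint32_t cin, kh, kw; };

static KnlIdx decompose(uint32_t crs_idx, uint32_t KW, uint32_t KH) {
    uint32_t cin = crs_idx / (KW * KH);
    uint32_t rem = crs_idx - cin * KW * KH;
    return { cin, rem / KW, rem - (rem / KW) * KW };
}

int main() {
    const uint32_t KW = 3, KH = 3, subgroup_size = 8; // hypothetical sizes

    // Without collectives, every thread runs decompose() itself. With
    // collectives, lane i decomposes index block_base + i once, and other
    // lanes read it with subgroupShuffle(cached_..., i), modeled here as
    // indexing into a per-lane cache.
    uint32_t block_base = 16; // B_idx_CRS * BS_CRS for some CRS block
    std::vector<KnlIdx> cache(subgroup_size);
    for (uint32_t lane = 0; lane < subgroup_size; lane++) {
        cache[lane] = decompose(block_base + lane, KW, KH); // one division chain per lane
    }
    for (uint32_t Ac = 0; Ac < subgroup_size; Ac++) {
        KnlIdx direct   = decompose(block_base + Ac, KW, KH);
        KnlIdx shuffled = cache[Ac]; // stands in for subgroupShuffle(cached_idx, Ac)
        assert(direct.cin == shuffled.cin && direct.kh == shuffled.kh && direct.kw == shuffled.kw);
    }
    return 0;
}
```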

ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -650,7 +650,7 @@ void process_shaders() {

     string_to_spv("opt_step_adamw_f32", "opt_step_adamw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));

-    string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}});

     string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}}));
     string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}}));
```

With the hard-coded `#define USE_COLLECTIVES` removed from the shader (see above), the define is now injected here at shader-compile time, while the actual on/off decision is made per device at pipeline-creation time through the `use_collectives` specialization constant.
