@@ -457,6 +457,7 @@ struct vk_device_struct {
457457 vk_pipeline pipeline_rwkv_wkv6_f32;
458458 vk_pipeline pipeline_rwkv_wkv7_f32;
459459 vk_pipeline pipeline_opt_step_adamw_f32;
460+ vk_pipeline pipeline_conv2d_f32;
460461 vk_pipeline pipeline_conv2d_dw_whcn_f32;
461462 vk_pipeline pipeline_conv2d_dw_cwhn_f32;
462463
@@ -816,6 +817,38 @@ struct vk_op_rwkv_wkv7_push_constants {
816817 uint32_t H;
817818};
818819
820+ struct vk_op_conv2d_push_constants {
821+ uint32_t Cout;
822+ uint32_t Cin;
823+ uint32_t N;
824+
825+ uint32_t KW;
826+ uint32_t KH;
827+ uint32_t W;
828+ uint32_t H;
829+ uint32_t OW;
830+ uint32_t OH;
831+
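+    // stride, padding and dilation, in the same order as ggml_conv_2d()'s op_params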
832+ uint32_t s0;
833+ uint32_t s1;
834+ uint32_t p0;
835+ uint32_t p1;
836+ uint32_t d0;
837+ uint32_t d1;
838+
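+    // Tensor strides in elements rather than bytes (byte strides divided by sizeof(float)); filled in by ggml_vk_conv_2d() below.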
839+ uint32_t nb01;
840+ uint32_t nb02;
841+ uint32_t nb03;
842+
843+ uint32_t nb11;
844+ uint32_t nb12;
845+ uint32_t nb13;
846+
847+ uint32_t nb1;
848+ uint32_t nb2;
849+ uint32_t nb3;
850+ };
851+
819852struct vk_op_conv2d_dw_push_constants {
820853 uint32_t ne;
821854 uint32_t batches;
@@ -916,16 +949,33 @@ class vk_memory_logger {
916949class vk_perf_logger {
917950public:
918951 void print_timings() {
952+        if (timings.empty()) {
953+            return;
954+        }
919955 std::cerr << "----------------\nVulkan Timings:" << std::endl;
920956 for (const auto& t : timings) {
921957 uint64_t total = 0;
922958 for (const auto& time : t.second) {
923959 total += time;
924960 }
925- std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us" << std::endl;
961+ std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us";
962+
963+            // If we have as many FLOP entries as timing entries for the op, compute and log the throughput.
964+            auto it = flops.find(t.first);
965+            if (it != flops.end() && (it->second).size() == t.second.size()) {
966+                uint64_t total_nflops = 0;
967+                for (const auto& elem : it->second) {
968+                    total_nflops += elem;
969+                }
970+                // total is in nanoseconds, so FLOP/ns == GFLOP/s.
971+                std::cerr << " (" << (double(total_nflops)/(1000.0*1000.0*1000.0)) / (double(total)/(1000.0*1000.0*1000.0)) << " GFLOPS/s)";
972+            }
973+
974+            std::cerr << std::endl;
926975 }
927976
928977 timings.clear();
978+ flops.clear();
929979 }
930980
931981 void log_timing(const ggml_tensor * node, uint64_t time) {
@@ -944,12 +994,33 @@ class vk_perf_logger {
944994 name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k);
945995 }
946996 timings[name].push_back(time);
997+            flops[name].push_back(m*n*(k+(k-1))); // k multiplies and (k-1) adds per output element
947998 return;
948999 }
1000+        if (node->op == GGML_OP_CONV_2D) {
1001+ std::string name = ggml_op_name(node->op);
1002+ ggml_tensor * knl = node->src[0];
1003+ uint64_t OW = node->ne[0];
1004+ uint64_t OH = node->ne[1];
1005+ uint64_t N = node->ne[3];
1006+ uint64_t Cout = node->ne[2];
1007+ uint64_t KW = knl->ne[0];
1008+ uint64_t KH = knl->ne[1];
1009+ uint64_t Cin = knl->ne[2];
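+            // dst is [OW, OH, Cout, N] and the kernel is [KW, KH, Cin, Cout] in ggml's WHCN dimension order.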
1010+ // KxCRS @ CRSxNPQ = KxNPQ -> M=K, K=CRS, N=NPQ
1011+ uint64_t size_M = Cout;
1012+ uint64_t size_K = Cin*KW*KH;
1013+ uint64_t size_N = N*OW*OH;
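+            // Each output element takes size_K multiplies and (size_K - 1) adds in the implicit GEMM.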
1014+ uint64_t n_flops = size_M*size_N*(size_K+(size_K-1));
1015+ flops[name].push_back(n_flops);
1016+ timings[name].push_back(time);
1017+ return;
1018+ }
9491019 timings[ggml_op_name(node->op)].push_back(time);
9501020 }
9511021private:
9521022 std::map<std::string, std::vector<uint64_t>> timings;
1023+ std::map<std::string, std::vector<uint64_t>> flops;
9531024};
9541025
9551026struct ggml_backend_vk_context {
@@ -2806,6 +2877,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
28062877
28072878 ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
28082879
2880+ ggml_vk_create_pipeline(device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, sizeof(vk_op_conv2d_push_constants), {128 /* equal to BS_K in the shader */, 128 /* equal to BS_NPQ in the shader */, 1}, {}, 1);
2881+
28092882 ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
28102883 ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f32, "conv2d_dw_cwhn_f32", conv2d_dw_cwhn_f32_len, conv2d_dw_cwhn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
28112884
@@ -6578,6 +6651,16 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
65786651 return ctx->device->pipeline_leaky_relu_f32;
65796652 }
65806653 return nullptr;
6654+ case GGML_OP_CONV_2D:
6655+ if (src0->type == GGML_TYPE_F32 &&
6656+ src1->type == GGML_TYPE_F32 &&
6657+ dst->type == GGML_TYPE_F32 &&
6658+ ggml_is_contiguous(src0) &&
6659+ ggml_is_contiguous(src1) &&
6660+ ggml_is_contiguous(dst)) {
6661+ return ctx->device->pipeline_conv2d_f32;
6662+ }
6663+ return nullptr;
65816664 case GGML_OP_CONV_2D_DW:
65826665 if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
65836666 if (ggml_is_contiguous(src1)) {
@@ -6899,6 +6982,30 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
68996982 const uint32_t OW = dst->ne[0];
69006983 elements = { N * OC * OH * OW, 1, 1};
69016984 } break;
6985+ case GGML_OP_CONV_2D:
6986+ {
6987+ // src0 - kernel: [KW, KH, Cin, Cout]
6988+ // src1 - input: [W, H, Cin, N]
6989+ // dst - result: [OW, OH, Cout, N]
6990+
6991+ // Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
6992+ auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
6993+ return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
6994+ };
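+                // e.g. ins=64, ks=3, s=1, p=1, d=1 -> (64 + 2 - 2 - 1)/1 + 1 = 64 (a "same" convolution)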
6995+                // Parallelize over the Cout x NPQ output matrix, tiled into BS_K x BS_NPQ blocks (see below).
6996+ int64_t W = src1->ne[0];
6997+ int64_t H = src1->ne[1];
6998+ int64_t KW = src0->ne[0];
6999+ int64_t KH = src0->ne[1];
7000+ int64_t Cout = src0->ne[3];
7001+ int64_t N = src1->ne[3];
7002+ int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
7003+ int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
7004+ int64_t NPQ = N*OW*OH;
7005+
7006+                // Tile the Cout x NPQ output matrix into (Cout/BS_K, NPQ/BS_NPQ, 1) workgroups; the division by the block sizes comes from the pipeline's wg_denoms, set to {BS_K, BS_NPQ, 1} above.
7007+ elements = {static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1};
7008+ } break;
69027009 case GGML_OP_ADD:
69037010 case GGML_OP_SUB:
69047011 case GGML_OP_DIV:
@@ -7753,6 +7860,55 @@ static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c
77537860 }, dryrun);
77547861}
77557862
7863+ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
7864+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
7865+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
7866+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
7867+
7868+ GGML_TENSOR_BINARY_OP_LOCALS
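+    // The macro above defines ne0*/nb0* for src0 (the kernel), ne1*/nb1* for src1 (the input) and ne*/nb* for dst.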
7869+
7870+ GGML_ASSERT(nb00 == sizeof(float));
7871+ GGML_ASSERT(nb10 == sizeof(float));
7872+ GGML_ASSERT(nb0 == sizeof(float));
7873+
7874+ vk_op_conv2d_push_constants p{};
7875+ p.Cout = static_cast<uint32_t>(ne03);
7876+ p.Cin = static_cast<uint32_t>(ne02);
7877+ p.N = static_cast<uint32_t>(ne13);
7878+
7879+ p.KW = static_cast<uint32_t>(ne00);
7880+ p.KH = static_cast<uint32_t>(ne01);
7881+ p.W = static_cast<uint32_t>(ne10);
7882+ p.H = static_cast<uint32_t>(ne11);
7883+ p.OW = static_cast<uint32_t>(ne0);
7884+ p.OH = static_cast<uint32_t>(ne1);
7885+
7886+ p.s0 = static_cast<uint32_t>(dst->op_params[0]);
7887+ p.s1 = static_cast<uint32_t>(dst->op_params[1]);
7888+ p.p0 = static_cast<uint32_t>(dst->op_params[2]);
7889+ p.p1 = static_cast<uint32_t>(dst->op_params[3]);
7890+ p.d0 = static_cast<uint32_t>(dst->op_params[4]);
7891+ p.d1 = static_cast<uint32_t>(dst->op_params[5]);
7892+
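+    // Convert byte strides to element strides for the shader; all tensors are F32, so nb00/nb10/nb0 == sizeof(float) (asserted above).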
7893+ p.nb01 = static_cast<uint32_t>(nb01/nb00);
7894+ p.nb02 = static_cast<uint32_t>(nb02/nb00);
7895+ p.nb03 = static_cast<uint32_t>(nb03/nb00);
7896+
7897+ p.nb11 = static_cast<uint32_t>(nb11/nb10);
7898+ p.nb12 = static_cast<uint32_t>(nb12/nb10);
7899+ p.nb13 = static_cast<uint32_t>(nb13/nb10);
7900+
7901+ p.nb1 = static_cast<uint32_t>(nb1 / nb0);
7902+ p.nb2 = static_cast<uint32_t>(nb2 / nb0);
7903+ p.nb3 = static_cast<uint32_t>(nb3 / nb0);
7904+
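+    // The kernel's Cout must match the output channels and its Cin must match the input channels.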
7905+ GGML_ASSERT(ne03 == ne2);
7906+ GGML_ASSERT(ne02 == ne12);
7907+
7908+    ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_2D, std::move(p), dryrun);
7909+ }
7911+
77567912static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
77577913 vk_op_conv2d_dw_push_constants p{};
77587914 p.ne = ggml_nelements(dst);
@@ -8799,6 +8955,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
87998955 case GGML_OP_TIMESTEP_EMBEDDING:
88008956 case GGML_OP_CONV_TRANSPOSE_1D:
88018957 case GGML_OP_POOL_2D:
8958+ case GGML_OP_CONV_2D:
88028959 case GGML_OP_CONV_2D_DW:
88038960 case GGML_OP_RWKV_WKV6:
88048961 case GGML_OP_RWKV_WKV7:
@@ -8864,6 +9021,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
88649021 case GGML_OP_TIMESTEP_EMBEDDING:
88659022 case GGML_OP_CONV_TRANSPOSE_1D:
88669023 case GGML_OP_POOL_2D:
9024+ case GGML_OP_CONV_2D:
88679025 case GGML_OP_CONV_2D_DW:
88689026 case GGML_OP_LEAKY_RELU:
88699027 {
@@ -9042,6 +9200,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
90429200 case GGML_OP_POOL_2D:
90439201 ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);
90449202
9203+ break;
9204+ case GGML_OP_CONV_2D:
9205+ ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node, dryrun);
9206+
90459207 break;
90469208 case GGML_OP_CONV_2D_DW:
90479209 ggml_vk_conv_2d_dw(ctx, compute_ctx, src0, src1, node, dryrun);
@@ -9168,6 +9330,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
91689330 case GGML_OP_TIMESTEP_EMBEDDING:
91699331 case GGML_OP_CONV_TRANSPOSE_1D:
91709332 case GGML_OP_POOL_2D:
9333+ case GGML_OP_CONV_2D:
91719334 case GGML_OP_CONV_2D_DW:
91729335 case GGML_OP_RWKV_WKV6:
91739336 case GGML_OP_RWKV_WKV7:
@@ -10242,6 +10405,14 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
1024210405 return true;
1024310406 case GGML_OP_CONV_TRANSPOSE_1D:
1024410407 return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
10408+ case GGML_OP_CONV_2D:
10409+ // Channel-contiguous format is not supported yet.
10410+ return (op->src[0]->type == GGML_TYPE_F32 &&
10411+ op->src[1]->type == GGML_TYPE_F32 &&
10412+ op->type == GGML_TYPE_F32 &&
10413+ ggml_is_contiguous(op->src[0]) &&
10414+ ggml_is_contiguous(op->src[1]) &&
10415+ ggml_is_contiguous(op));
1024510416 default:
1024610417 return false;
1024710418 }
@@ -10765,6 +10936,14 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
1076510936 const int32_t p1 = tensor->op_params[6];
1076610937
1076710938 tensor_clone = ggml_pool_2d(ggml_ctx, src_clone[0], op, k0, k1, s0, s1, p0, p1);
10939+ } else if (tensor->op == GGML_OP_CONV_2D) {
10940+ const int32_t s0 = tensor->op_params[0];
10941+ const int32_t s1 = tensor->op_params[1];
10942+ const int32_t p0 = tensor->op_params[2];
10943+ const int32_t p1 = tensor->op_params[3];
10944+ const int32_t d0 = tensor->op_params[4];
10945+ const int32_t d1 = tensor->op_params[5];
10946+ tensor_clone = ggml_conv_2d(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1);
1076810947 } else if (tensor->op == GGML_OP_LEAKY_RELU) {
1076910948 const float * op_params = (const float *)tensor->op_params;
1077010949 tensor_clone = ggml_leaky_relu(ggml_ctx, src_clone[0], op_params[0], false);