Skip to content

Commit

Permalink
vulkan : implement Stable Diffusion operators (#904)
Browse files Browse the repository at this point in the history
* Fix Vulkan repeat op

* Implement Vulkan concat op

* Delete old Vulkan shader generator

* Implement Vulkan im2col op

* Implement Vulkan unary gelu_quick op

* Implement Vulkan group_norm op

* Implement Vulkan timestep_embedding op

* Implement Vulkan upscale op

* Fix Vulkan vk_context tensor extra index issue

* Fix Vulkan matmul shader parameter bug

* Properly fix Vulkan matmul shader parameter bug

* Add Vulkan ADD f16 + f32 -> f16 operator support

* Implement Vulkan tanh op

* Fix Vulkan group count too large Validation error on non-Nvidia GPUs

* Throw error when too much memory is requested

* Fix another Vulkan group count too large Validation error on non-Nvidia GPUs

* Fix matmul MMQ condition

* Implement Vulkan pad op

* Fix Vulkan crash when tensor is used multiple times in a compute graph

* Add Vulkan CONCAT f16 + f16 -> f16 op

* Add Vulkan LEAKY_RELU op
  • Loading branch information
0cc4m authored Aug 4, 2024
1 parent 29d87fc commit 18703ad
Show file tree
Hide file tree
Showing 29 changed files with 1,029 additions and 3,412 deletions.
840 changes: 605 additions & 235 deletions src/ggml-vulkan.cpp

Large diffs are not rendered by default.

3,149 changes: 0 additions & 3,149 deletions src/ggml_vk_generate_shaders.py

This file was deleted.

6 changes: 4 additions & 2 deletions src/vulkan-shaders/add.comp
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
#include "generic_binary_head.comp"

void main() {
if (gl_GlobalInvocationID.x >= p.ne) {
const uint idx = get_idx();

if (idx >= p.ne) {
return;
}

data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) + FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[src1_idx(idx)]));
}
8 changes: 5 additions & 3 deletions src/vulkan-shaders/clamp.comp
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
#include "generic_unary_head.comp"

void main() {
if (gl_GlobalInvocationID.x >= p.ne) {
const uint idx = get_idx();

if (idx >= p.ne) {
return;
}

const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
}
35 changes: 35 additions & 0 deletions src/vulkan-shaders/concat.comp
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#version 450

#include "types.comp"
#include "generic_binary_head.comp"

void main() {
const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
const int dim = p.param3;

if (idx >= p.ne) {
return;
}

const uint i3 = idx / (p.ne22*p.ne21*p.ne20);
const uint i3_offset = i3 * p.ne22*p.ne21*p.ne20;
const uint i2 = (idx - i3_offset) / (p.ne21*p.ne20);
const uint i2_offset = i2*p.ne21*p.ne20;
const uint i1 = (idx - i3_offset - i2_offset) / p.ne20;
const uint i0 = idx - i3_offset - i2_offset - i1*p.ne20;

uint o[4] = {0, 0, 0, 0};
o[dim] = dim == 0 ? p.ne00 : (dim == 1 ? p.ne01 : (dim == 2 ? p.ne02 : p.ne03));

const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00;
const uint src1_idx = (i3 - o[3])*p.nb13 + (i2 - o[2])*p.nb12 + (i1 - o[1])*p.nb11 + (i0 - o[0])*p.nb10;
const uint dst_idx = i3*p.nb23 + i2*p.nb22 + i1*p.nb21 + i0*p.nb20;

const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;

#ifndef OPTIMIZATION_ERROR_WORKAROUND
data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]);
#else
data_d[p.d_offset + dst_idx] = is_src0 ? data_a[src0_idx] : data_b[src1_idx];
#endif
}
8 changes: 5 additions & 3 deletions src/vulkan-shaders/copy.comp
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@
#include "generic_unary_head.comp"

void main() {
if (gl_GlobalInvocationID.x >= p.ne) {
const uint idx = get_idx();

if (idx >= p.ne) {
return;
}

#ifndef OPTIMIZATION_ERROR_WORKAROUND
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx(idx)]);
#else
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = data_a[src0_idx(gl_GlobalInvocationID.x)];
data_d[p.d_offset + dst_idx(idx)] = data_a[src0_idx(idx)];
#endif
}
6 changes: 4 additions & 2 deletions src/vulkan-shaders/div.comp
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
#include "generic_binary_head.comp"

void main() {
if (gl_GlobalInvocationID.x >= p.ne) {
const uint idx = get_idx();

if (idx >= p.ne) {
return;
}

data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) / FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) / FLOAT_TYPE(data_b[src1_idx(idx)]));
}
2 changes: 1 addition & 1 deletion src/vulkan-shaders/gelu.comp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
const float GELU_COEF_A = 0.044715f;
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
const uint i = gl_GlobalInvocationID.x;
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;

if (i >= p.KX) {
return;
Expand Down
23 changes: 23 additions & 0 deletions src/vulkan-shaders/gelu_quick.comp
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#version 450

#include "generic_head.comp"
#include "types.comp"

#extension GL_EXT_control_flow_attributes : enable

layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};

void main() {
const float GELU_QUICK_COEF = -1.702f;
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;

if (i >= p.KX) {
return;
}

const float x = float(data_a[i]);
data_d[i] = D_TYPE(x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x))));
}
6 changes: 5 additions & 1 deletion src/vulkan-shaders/generic_binary_head.comp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ layout (push_constant) uniform parameter
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
uint d_offset;
float param1; float param2;
float param1; float param2; int param3;
} p;

layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
Expand All @@ -16,6 +16,10 @@ layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};

uint get_idx() {
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
}

uint src0_idx(uint idx) {
const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
Expand Down
4 changes: 4 additions & 0 deletions src/vulkan-shaders/generic_unary_head.comp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};

uint get_idx() {
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
}

uint src0_idx(uint idx) {
const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
Expand Down
66 changes: 66 additions & 0 deletions src/vulkan-shaders/group_norm.comp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#version 450

#include "generic_head.comp"
#include "types.comp"

#extension GL_EXT_control_flow_attributes : enable
#define BLOCK_SIZE 512

layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};

shared float tmp[BLOCK_SIZE];

void main() {
const uint group_size = p.KX;
const float eps = p.param1;

const uint tid = gl_LocalInvocationID.x;
const uint start = gl_WorkGroupID.x * group_size + tid;
const uint end = start + group_size;

tmp[tid] = 0.0f;

// Calculate mean
[[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
tmp[tid] += float(data_a[col]);
}

// tmp up partial tmps and write back result
barrier();
[[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
barrier();
}

const float mean = tmp[0] / group_size;
barrier();
tmp[tid] = 0.0f;

// Calculate variance
[[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
const float xi = float(data_a[col]) - mean;
data_d[col] = D_TYPE(xi);
tmp[tid] += xi * xi;
}

// sum up partial sums and write back result
barrier();
[[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
barrier();
}

const float variance = tmp[0] / group_size;
const float scale = inversesqrt(variance + eps);

[[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
data_d[col] *= D_TYPE(scale);
}
}
57 changes: 57 additions & 0 deletions src/vulkan-shaders/im2col.comp
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#version 450

#extension GL_EXT_shader_16bit_storage : require

layout (push_constant) uniform parameter
{
uint batch_offset; uint offset_delta;
uint IC;
uint IW; uint IH;
uint OW; uint OH;
uint KW; uint KH;
uint pelements;
uint CHW;
int s0; int s1;
int p0; int p1;
int d0; int d1;
} p;

#include "types.comp"

#define BLOCK_SIZE 256

layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};

void main() {
const uint i = gl_GlobalInvocationID.x;
if (i >= p.pelements) {
return;
}

const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);
const uint kx = i / ksize;
const uint kd = kx * ksize;
const uint ky = (i - kd) / p.OW;
const uint ix = i % p.OW;

const uint oh = gl_GlobalInvocationID.y;
const uint batch = gl_GlobalInvocationID.z / p.IC;
const uint ic = gl_GlobalInvocationID.z % p.IC;

const uint iiw = ix * p.s0 + kx * p.d0 - p.p0;
const uint iih = oh * p.s1 + ky * p.d1 - p.p1;

const uint offset_dst =
((batch * p.OH + oh) * p.OW + ix) * p.CHW +
(ic * (p.KW * p.KH) + ky * p.KW + kx);

if (iih < 0 || iih >= p.IH || iiw < 0 || iiw >= p.IW) {
data_d[offset_dst] = D_TYPE(0.0f);
} else {
const uint offset_src = ic * p.offset_delta + batch * p.batch_offset;
data_d[offset_dst] = D_TYPE(data_a[offset_src + iih * p.IW + iiw]);
}
}
22 changes: 22 additions & 0 deletions src/vulkan-shaders/leaky_relu.comp
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#version 450

#include "generic_head.comp"
#include "types.comp"

#extension GL_EXT_control_flow_attributes : enable

layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};

void main() {
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;

if (i >= p.KX) {
return;
}

const float val = float(data_a[i]);
data_d[i] = D_TYPE(max(val, 0.0f) + min(val, 0.0f) * p.param1);
}
6 changes: 4 additions & 2 deletions src/vulkan-shaders/mul.comp
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
#include "generic_binary_head.comp"

void main() {
if (gl_GlobalInvocationID.x >= p.ne) {
const uint idx = get_idx();

if (idx >= p.ne) {
return;
}

data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(data_b[src1_idx(idx)]));
}
2 changes: 1 addition & 1 deletion src/vulkan-shaders/norm.comp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
shared vec2 sum[BLOCK_SIZE];

void main() {
const uint row = gl_WorkGroupID.x;
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
const uint tid = gl_LocalInvocationID.x;

sum[tid] = vec2(0.0f, 0.0f);
Expand Down
26 changes: 26 additions & 0 deletions src/vulkan-shaders/pad.comp
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#version 450

#include "types.comp"
#include "generic_unary_head.comp"

void main() {
const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;

if (idx >= p.ne) {
return;
}

const uint i3 = idx / (p.ne12*p.ne11*p.ne10);
const uint i3_offset = i3 * p.ne12*p.ne11*p.ne10;
const uint i2 = (idx - i3_offset) / (p.ne11*p.ne10);
const uint i2_offset = i2*p.ne11*p.ne10;
const uint i1 = (idx - i3_offset - i2_offset) / p.ne10;
const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10;

const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00;
const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10;

const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;

data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : 0.0f);
}
2 changes: 1 addition & 1 deletion src/vulkan-shaders/relu.comp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};

void main() {
const uint i = gl_GlobalInvocationID.x;
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;

if (i >= p.KX) {
return;
Expand Down
2 changes: 1 addition & 1 deletion src/vulkan-shaders/rms_norm.comp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
shared FLOAT_TYPE sum[BLOCK_SIZE];

void main() {
const uint row = gl_WorkGroupID.x;
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
const uint tid = gl_LocalInvocationID.x;

sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp
Expand Down
6 changes: 4 additions & 2 deletions src/vulkan-shaders/scale.comp
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
#include "generic_unary_head.comp"

void main() {
if (gl_GlobalInvocationID.x >= p.ne) {
const uint idx = get_idx();

if (idx >= p.ne) {
return;
}

data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(p.param1));
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(p.param1));
}
Loading

0 comments on commit 18703ad

Please sign in to comment.