Skip to content

Commit

Permalink
Merge branch 'master' into concedo_experimental
Browse files Browse the repository at this point in the history
  • Loading branch information
LostRuins committed Jun 9, 2023
2 parents 73cc5b8 + 92f44ff commit b617f28
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 16 deletions.
18 changes: 17 additions & 1 deletion ggml-metal.m
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
GGML_METAL_DECL_KERNEL(scale);
GGML_METAL_DECL_KERNEL(silu);
GGML_METAL_DECL_KERNEL(relu);
GGML_METAL_DECL_KERNEL(gelu);
GGML_METAL_DECL_KERNEL(soft_max);
GGML_METAL_DECL_KERNEL(diag_mask_inf);
GGML_METAL_DECL_KERNEL(get_rows_f16);
Expand Down Expand Up @@ -135,6 +136,7 @@
GGML_METAL_ADD_KERNEL(scale);
GGML_METAL_ADD_KERNEL(silu);
GGML_METAL_ADD_KERNEL(relu);
GGML_METAL_ADD_KERNEL(gelu);
GGML_METAL_ADD_KERNEL(soft_max);
GGML_METAL_ADD_KERNEL(diag_mask_inf);
GGML_METAL_ADD_KERNEL(get_rows_f16);
Expand Down Expand Up @@ -420,6 +422,20 @@ void ggml_metal_graph_compute(

[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_GELU:
{
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder];
}

[encoder setComputePipelineState:ctx->pipeline_gelu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];

const int64_t n = ggml_nelements(dst);

[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_SOFT_MAX:
{
if (encoder == nil) {
Expand Down Expand Up @@ -526,7 +542,7 @@ void ggml_metal_graph_compute(
GGML_ASSERT(ne12 == 1);

nth0 = 8;
nth1 = 4;
nth1 = 8;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32];
} break;
case GGML_TYPE_Q2_K:
Expand Down
45 changes: 30 additions & 15 deletions ggml-metal.metal
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,17 @@ kernel void kernel_relu(
dst[tpig] = max(0.0f, src0[tpig]);
}

constant float GELU_COEF_A = 0.044715f;
constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;

kernel void kernel_gelu(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
float x = src0[tpig];
dst[tpig] = 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}

kernel void kernel_soft_max(
device const float * src0,
device float * dst,
Expand Down Expand Up @@ -267,6 +278,8 @@ kernel void kernel_mul_mat_q4_0_f32(
uint2 tptg[[threads_per_threadgroup]]) {
const int nb = ne00/QK4_0;

const int8_t m8 = 8;

const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;

Expand All @@ -276,33 +289,34 @@ kernel void kernel_mul_mat_q4_0_f32(
const uint nth = tptg.x*tptg.y;
const uint ith = tptg.y*tpitg.x + tpitg.y;

sum[ith] = 0.0f;
const int ix = tpitg.y/4; // 0 or 1
const int iy = tpitg.y - 4*ix; // 0...3

for (int i = tpitg.x; i < nb; i += tptg.x) {
device const uchar4 * x0p = (device const uchar4 *) (x + i)->qs;
device const float4 * y0p = (device const float4 *) (y + i*QK4_0);
const int first = 4 * iy;

const float d = (float)((x + i)->d);
float sumf = 0;

for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {

const float d = (float)x[i].d;

const uchar4 x0v = *(x0p + tpitg.y);
const float4 y0v = *(y0p + tpitg.y + 0);
const float4 y1v = *(y0p + tpitg.y + 4);
device const uint8_t * xl = x[i].qs + first;
device const float * yl = y + i * QK4_0 + first;

float acc = 0.0f;
float2 acc = {0.0f, 0.0f};

for (int j = 0; j < 4; ++j) {
const int x0 = x0v[j] & 0x0F;
const int x1 = x0v[j] >> 4;

const float y0 = y0v[j];
const float y1 = y1v[j];
acc[0] += yl[j+ 0] * ((int8_t)(xl[j] & 0xF) - m8);
acc[1] += yl[j+16] * ((int8_t)(xl[j] >> 4) - m8);

acc += (x0 - 8)*y0 + (x1 - 8)*y1;
}

sum[ith] += acc*d;
sumf += d * (acc[0] + acc[1]);
}

sum[ith] = sumf;

//
// Accumulate the sum from all threads in the threadgroup
// This version is slightly faster than the commented out one below,
Expand Down Expand Up @@ -357,6 +371,7 @@ kernel void kernel_mul_mat_f16_f32(
uint3 tpig[[thread_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 tptg[[threads_per_threadgroup]]) {

const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;
const int64_t im = tgpig.z;
Expand Down

0 comments on commit b617f28

Please sign in to comment.