@@ -5559,127 +5559,6 @@ void ggml_compute_forward_softcap(
     }
 }
 
-// ggml_compute_forward_softcap_max
-
-static void ggml_compute_forward_softcap_max_f32(
-        const ggml_compute_params * params,
-        ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    assert(ggml_is_contiguous(dst));
-    assert(ggml_are_same_shape(src0, dst));
-
-    float values[4];
-    memcpy(values, dst->op_params, sizeof(values));
-
-    // memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
-    // memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
-
-    // TODO: handle transposed/permuted matrices
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    // const int64_t ne11 = src1 ? src1->ne[1] : 1;
-
-    // TODO: is this supposed to be ceil instead of floor?
-    //   https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
-    const uint32_t n_head      = ne02;
-    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(values[1]       ) / n_head_log2);
-    const float m1 = powf(2.0f, -(values[1] / 2.0f) / n_head_log2);
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
-
-    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        // ALiBi
-        const uint32_t h = (i1/ne01)%ne02; // head
-        const float slope = (values[1] > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
-
-        float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float * dp = (float *)((char *)  dst->data + i1*dst->nb[1]);
-
-        // broadcast the mask across rows
-        ggml_fp16_t * mp_f16 = src1 ? (ggml_fp16_t *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
-        float       * mp_f32 = src1 ? (float       *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
-
-        ggml_vec_cpy_f32  (nc, wp, sp);
-        ggml_vec_scale_f32(nc, wp, values[0]);
-        if (mp_f32) {
-            if (use_f16) {
-                for (int i = 0; i < nc; ++i) {
-                    wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]);
-                }
-            } else {
-                for (int i = 0; i < nc; ++i) {
-                    wp[i] += slope*mp_f32[i];
-                }
-            }
-        }
-
-        ggml_vec_softcap_f32(nc, wp, values[2], values[3]);
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            // printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(wp[i]));
-        }
-#endif
-
-        float max = -INFINITY;
-        ggml_vec_max_f32(nc, &max, wp);
-
-        ggml_float sum = ggml_vec_soft_max_f32(nc, dp, wp, max);
-        assert(sum > 0.0);
-
-        sum = 1.0/sum;
-        ggml_vec_scale_f32(nc, dp, sum);
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            assert(!isnan(dp[i]));
-            assert(!isinf(dp[i]));
-        }
-#endif
-    }
-
-}
-
-void ggml_compute_forward_softcap_max(
-        const ggml_compute_params * params,
-        ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_softcap_max_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            }
-    }
-}
-
 // ggml_compute_forward_set
 
 static void ggml_compute_forward_set_f32(