
Commit 1dea45e

Revert "Reapply "IK Softcap_max" corrected"

1 parent 9118bcf commit 1dea45e

11 files changed: +28 / -335 lines

ggml/include/ggml.h

Lines changed: 0 additions & 20 deletions
@@ -635,7 +635,6 @@ extern "C" {
         GGML_OP_ARGSORT,
         GGML_OP_LEAKY_RELU,
         GGML_OP_SOFTCAP,
-        GGML_OP_SOFT_CAP_MAX,
 
         GGML_OP_FLASH_ATTN_EXT,
         GGML_OP_FLASH_ATTN_BACK,
@@ -1478,25 +1477,6 @@ extern "C" {
             float s_before,
             float s_after);
 
-    GGML_API struct ggml_tensor * ggml_softcap_max(
-            struct ggml_context * ctx,
-            struct ggml_tensor * a,
-            struct ggml_tensor * mask,
-            float scale,
-            float max_bias,
-            float s_before,
-            float s_after);
-
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_softcap_max_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor * a,
-            struct ggml_tensor * mask,
-            float scale,
-            float max_bias,
-            float s_before,
-            float s_after);
-
     // b -> view(a,offset,nb1,nb2,3), return modified a
     GGML_API struct ggml_tensor * ggml_set(
             struct ggml_context * ctx,
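Note: the two declarations removed above fused scaling, an optional ALiBi-weighted mask, soft-capping, and softmax into a single graph op; after the revert only the individual ops remain in the public header. Below is a minimal sketch (not code from this commit) of how a caller could compose them to the same effect for the common max_bias == 0 case. It assumes a ggml_softcap(ctx, a, s_before, s_after) entry point matching the declaration whose tail is visible in the context lines above.

// Minimal sketch (illustration only): compose the remaining ops to mimic the
// removed ggml_softcap_max() when max_bias == 0 (ALiBi slope == 1).
// Assumes ggml_softcap(ctx, a, s_before, s_after); ggml_scale/ggml_add/
// ggml_soft_max are the standard ggml entry points.
static struct ggml_tensor * softcap_max_composed(
        struct ggml_context * ctx,
        struct ggml_tensor * a,      // logits
        struct ggml_tensor * mask,   // optional additive mask, or NULL
        float scale, float s_before, float s_after) {
    struct ggml_tensor * t = ggml_scale(ctx, a, scale);  // scale logits
    if (mask) {
        t = ggml_add(ctx, t, mask);                      // add mask (slope == 1)
    }
    t = ggml_softcap(ctx, t, s_before, s_after);         // s_after * tanh(s_before * x)
    return ggml_soft_max(ctx, t);                        // row-wise softmax
}

The order (scale, mask, cap, softmax) mirrors the removed CPU kernel in ops.cpp further down.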

ggml/src/ggml-alloc.c

Lines changed: 0 additions & 1 deletion
@@ -46,7 +46,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_SOFT_MAX:
         case GGML_OP_SOFT_MAX_BACK:
         case GGML_OP_SOFTCAP:
-        case GGML_OP_SOFT_CAP_MAX:
             return true;
 
         default:

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 0 additions & 5 deletions
@@ -2546,10 +2546,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_softcap(params, tensor);
             } break;
-        case GGML_OP_SOFT_CAP_MAX:
-            {
-                ggml_compute_forward_softcap_max(params, tensor);
-            } break;
         case GGML_OP_SET:
             {
                 ggml_compute_forward_set(params, tensor);
@@ -3004,7 +3000,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_SOFTCAP:
        case GGML_OP_SOFT_MAX:
-        case GGML_OP_SOFT_CAP_MAX:
             {
                 n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
             } break;
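After the revert, GGML_OP_SOFTCAP and GGML_OP_SOFT_MAX keep the same row-parallel task count: one task per row, capped at the thread count. For orientation, a small sketch of the per-thread row split these kernels use (the same dr/ir0/ir1 pattern appears in the removed forward pass in ops.cpp below); this is a reference sketch, not ggml code.

// Illustrative sketch of the row partitioning behind
// n_tasks = MIN(n_threads, ggml_nrows(src0)):
// thread ith of nth handles rows [ir0, ir1).
static void row_range_for_thread(int nr, int ith, int nth, int * ir0, int * ir1) {
    const int dr = (nr + nth - 1) / nth;       // rows per thread, rounded up
    *ir0 = dr * ith;
    *ir1 = (*ir0 + dr < nr) ? *ir0 + dr : nr;  // clamp the last thread to nr
}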

ggml/src/ggml-cpu/ops.cpp

Lines changed: 0 additions & 121 deletions
@@ -5559,127 +5559,6 @@ void ggml_compute_forward_softcap(
     }
 }
 
-// ggml_compute_forward_softcap_max
-
-static void ggml_compute_forward_softcap_max_f32(
-        const ggml_compute_params * params,
-        ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    assert(ggml_is_contiguous(dst));
-    assert(ggml_are_same_shape(src0, dst));
-
-    float values[4];
-    memcpy(values, dst->op_params, sizeof(values));
-
-    //memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
-    //memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
-
-    // TODO: handle transposed/permuted matrices
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    //const int64_t ne11 = src1 ? src1->ne[1] : 1;
-
-    // TODO: is this supposed to be ceil instead of floor?
-    // https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
-    const uint32_t n_head = ne02;
-    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(values[1]       ) / n_head_log2);
-    const float m1 = powf(2.0f, -(values[1] / 2.0f) / n_head_log2);
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
-
-    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        // ALiBi
-        const uint32_t h = (i1/ne01)%ne02; // head
-        const float slope = (values[1] > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
-
-        float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float * dp = (float *)((char *)  dst->data +  i1*dst->nb[1]);
-
-        // broadcast the mask across rows
-        ggml_fp16_t * mp_f16 = src1 ? (ggml_fp16_t *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
-        float       * mp_f32 = src1 ? (float       *)((char *) src1->data) + (i1%ne01)*ne00 : NULL;
-
-        ggml_vec_cpy_f32  (nc, wp, sp);
-        ggml_vec_scale_f32(nc, wp, values[0]);
-        if (mp_f32) {
-            if (use_f16) {
-                for (int i = 0; i < nc; ++i) {
-                    wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]);
-                }
-            } else {
-                for (int i = 0; i < nc; ++i) {
-                    wp[i] += slope*mp_f32[i];
-                }
-            }
-        }
-
-        ggml_vec_softcap_f32(nc, wp, values[2], values[3]);
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            //printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(wp[i]));
-        }
-#endif
-
-        float max = -INFINITY;
-        ggml_vec_max_f32(nc, &max, wp);
-
-        ggml_float sum = ggml_vec_soft_max_f32(nc, dp, wp, max);
-        assert(sum > 0.0);
-
-        sum = 1.0/sum;
-        ggml_vec_scale_f32(nc, dp, sum);
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            assert(!isnan(dp[i]));
-            assert(!isinf(dp[i]));
-        }
-#endif
-    }
-
-}
-
-void ggml_compute_forward_softcap_max(
-        const ggml_compute_params * params,
-        ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_softcap_max_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            }
-    }
-}
-
 // ggml_compute_forward_set
 
 static void ggml_compute_forward_set_f32(
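Stripped of ggml's vectorized helpers, thread partitioning, and the F16 mask path, the removed forward pass computed per row y = softmax(s_after * tanh(s_before * (scale*x + slope*mask))), with the ALiBi slope derived from max_bias (op_params[1]) and the head index. A scalar reference sketch of that math, for orientation only and not part of this commit:

#include <math.h>

// Per-head ALiBi slope as computed in the removed kernel
// (slope == 1 when max_bias <= 0).
static float alibi_slope(float max_bias, unsigned h, unsigned n_head) {
    if (max_bias <= 0.0f) return 1.0f;
    const unsigned n_head_log2 = 1u << (unsigned) floorf(log2f((float) n_head));
    const float m0 = powf(2.0f, -max_bias          / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
    return h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
}

// Scalar reference for one row: scale, add the (slope-weighted) mask,
// soft-cap, then a numerically stable softmax.
static void softcap_max_row(float * dst, const float * src, const float * mask,
                            int n, float scale, float slope,
                            float s_before, float s_after) {
    float max = -INFINITY;
    for (int i = 0; i < n; ++i) {
        float v = scale * src[i];
        if (mask) v += slope * mask[i];
        v = s_after * tanhf(s_before * v);   // soft cap
        dst[i] = v;
        if (v > max) max = v;
    }
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {            // softmax with max subtracted
        dst[i] = expf(dst[i] - max);
        sum += dst[i];
    }
    for (int i = 0; i < n; ++i) {
        dst[i] /= sum;
    }
}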

ggml/src/ggml-cpu/ops.h

Lines changed: 0 additions & 1 deletion
@@ -67,7 +67,6 @@ void ggml_compute_forward_diag_mask_zero(const struct ggml_compute_params * para
 void ggml_compute_forward_soft_max(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_soft_max_ext_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_softcap(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_softcap_max(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_rope(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_rope_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struct ggml_tensor * dst);

ggml/src/ggml-cpu/vec.h

Lines changed: 0 additions & 35 deletions
@@ -1206,41 +1206,6 @@ static void ggml_vec_softcap_f32(const int n, float * x, float s_before, float s
     }
 }
 
-static float ggml_vec_softcap_max_f32(const int n, float * x, float s_before, float s_after) {
-    int i = 0;
-    float max = -INFINITY;
-#if defined(__AVX512F__) && defined(__AVX512DQ__)
-    __m512 vs_before = _mm512_set1_ps(2.f*s_before);
-    __m512 vs_after = _mm512_set1_ps(s_after);
-    __m512 vmax = _mm512_set1_ps(-INFINITY);
-    for (; i + 15 < n; i += 16) {
-        __m512 y = ggml_v_softcap(_mm512_loadu_ps(x + i), vs_before, vs_after);
-        _mm512_storeu_ps(x + i, y);
-        vmax = _mm512_max_ps(vmax, y);
-    }
-    max = _mm512_reduce_max_ps(vmax);
-#elif defined(__AVX2__) && defined(__FMA__)
-    for (; i + 7 < n; i += 8) {
-        _mm256_storeu_ps(x + i, ggml_v_softcap(_mm256_loadu_ps(x + i), s_before, s_after));
-    }
-#elif defined(__SSE2__)
-    for (; i + 3 < n; i += 4) {
-        _mm_storeu_ps(x + i, ggml_v_softcap(_mm_loadu_ps(x + i), s_before, s_after));
-    }
-#elif defined(__ARM_NEON) && defined(__aarch64__)
-    float32x4_t vs_before = vdupq_n_f32(s_before);
-    float32x4_t vs_after = vdupq_n_f32(s_after);
-    for (; i + 3 < n; i += 4) {
-        vst1q_f32(x + i, ggml_v_softcap(vld1q_f32(x + i), vs_before, vs_after));
-    }
-#endif
-    for (; i < n; ++i) {
-        x[i] = s_after*tanhf(x[i]*s_before);
-        max = MAX(max, x[i]);
-    }
-    return max;
-}
-
 //
 // On my AVX512 (Ryzen-7950X) and AVX2 (Ryzen-5975WX) computing gelu directly
 // via SIMD instructions is faster than the fp16-based lookup table.
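For reference, the removed helper capped each element in place (s_after * tanh(s_before * x)) and returned a row maximum, with AVX512/AVX2/SSE2/NEON fast paths built on ggml_v_softcap. A portable scalar equivalent of its tail loop, shown only as a sketch:

#include <math.h>

// Scalar reference matching the tail loop of the removed
// ggml_vec_softcap_max_f32(): cap each element in place and
// return the maximum of the capped values.
static float vec_softcap_max_ref(const int n, float * x, float s_before, float s_after) {
    float max = -INFINITY;
    for (int i = 0; i < n; ++i) {
        x[i] = s_after * tanhf(x[i] * s_before);
        if (x[i] > max) max = x[i];
    }
    return max;
}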

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 0 additions & 4 deletions
@@ -2661,9 +2661,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_SOFT_MAX_BACK:
             ggml_cuda_op_soft_max_back(ctx, dst);
             break;
-        case GGML_OP_SOFT_CAP_MAX:
-            ggml_cuda_op_soft_cap_max(ctx, dst);
-            break;
         case GGML_OP_ROPE:
             ggml_cuda_op_rope(ctx, dst);
             break;
@@ -3703,7 +3700,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
             return true;
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
-        case GGML_OP_SOFT_CAP_MAX:
             return true;
         case GGML_OP_SOFT_MAX_BACK: {
             float max_bias = 0.0f;
