
Commit cc36620

Revert "Fused soft cap and SIMD-ified GeLU #9"
This reverts commit bb500aa.
1 parent 7bbe9e6 commit cc36620

9 files changed, 16 insertions(+), 424 deletions(-)

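For reference, the reverted soft-cap operation applies y = s_after * tanh(s_before * x) elementwise; the scalar fallback loop removed from ggml-cpu.c below spells this out. A minimal standalone C sketch of that scalar form (the function name softcap_ref and the test values are illustrative, not taken from the reverted code):

#include <math.h>
#include <stdio.h>

// Scalar reference of the reverted soft-cap op: scale, tanh, scale again.
static void softcap_ref(int n, float * x, float s_before, float s_after) {
    for (int i = 0; i < n; ++i) {
        x[i] = s_after * tanhf(x[i] * s_before);
    }
}

int main(void) {
    float v[4] = {-100.0f, -1.0f, 1.0f, 100.0f};  // illustrative inputs
    softcap_ref(4, v, 1.0f/30.0f, 30.0f);         // hypothetical s_before/s_after
    for (int i = 0; i < 4; ++i) {
        printf("%g\n", v[i]);                     // large inputs saturate near +/- s_after
    }
    return 0;
}

Because tanh saturates, the outputs stay bounded in (-s_after, s_after) no matter how large the inputs get.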

ggml/include/ggml.h

Lines changed: 0 additions & 14 deletions

@@ -544,7 +544,6 @@ extern "C" {
         GGML_OP_TIMESTEP_EMBEDDING,
         GGML_OP_ARGSORT,
         GGML_OP_LEAKY_RELU,
-        GGML_OP_SOFTCAP,

         GGML_OP_FLASH_ATTN_EXT,
         GGML_OP_FLASH_ATTN_BACK,
@@ -1208,19 +1207,6 @@ extern "C" {
             struct ggml_tensor * a,
             float s);

-    GGML_API struct ggml_tensor * ggml_softcap(
-            struct ggml_context * ctx,
-            struct ggml_tensor * a,
-            float s_before,
-            float s_after);
-
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_softcap_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor * a,
-            float s_before,
-            float s_after);
-
     // b -> view(a,offset,nb1,nb2,3), return modified a
     GGML_API struct ggml_tensor * ggml_set(
             struct ggml_context * ctx,
ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 6 additions & 254 deletions

@@ -1887,15 +1887,15 @@ inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float *
 inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }

-static const float GELU_QUICK_COEF = -1.702f;
 static const float GELU_COEF_A = 0.044715f;
+static const float GELU_QUICK_COEF = -1.702f;
 static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;

 inline static float ggml_gelu_f32(float x) {
     return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
 }

-/* inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     const uint16_t * i16 = (const uint16_t *) x;
     for (int i = 0; i < n; ++i) {
         y[i] = ggml_table_gelu_f16[i16[i]];
@@ -1923,7 +1923,7 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
         y[i] = ggml_gelu_f32(x[i]);
     }
 }
-#endif */
+#endif

 inline static float ggml_gelu_quick_f32(float x) {
     return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
@@ -1999,34 +1999,7 @@ inline static float32x4_t ggml_v_silu(float32x4_t x) {
     const float32x4_t neg_x = vsubq_f32(zero, x);
     const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
     const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
-    // return vdivq_f32(x, one_plus_exp_neg_x);
-    const uint32x4_t mask = vcgtq_f32(x, vdupq_n_f32(10.f));
-    const float32x4_t res = vdivq_f32(vsubq_f32(exp_two_x, one), vaddq_f32(exp_two_x, one));
-    return vreinterpretq_f32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_f32(one), mask), vbicq_u32(vreinterpretq_u32_f32(res), mask)));
-    //return vdivq_f32(vsubq_f32(exp_two_x, one), vaddq_f32(exp_two_x, one));
-}
-
-inline static float32x4_t ggml_v_softcap(float32x4_t x, float32x4_t s_before, float32x4_t s_after) {
-    return vmulq_f32(s_after, ggml_v_tanh(vmulq_f32(x, s_before)));
-    //const float32x4_t one = vdupq_n_f32(1.0f);
-    //const float32x4_t two_x = vmulq_f32(x, s_before);
-    //const float32x4_t exp_two_x = ggml_v_expf(two_x);
-    //const float32x4_t th = vdivq_f32(vsubq_f32(exp_two_x, one), vaddq_f32(exp_two_x, one));
-    //return vmulq_f32(th, s_after);
-}
-
-
-// Slower than lookup on my M2-Max
-inline static float32x4_t ggml_v_gelu(float32x4_t x, float32x4_t c1, float32x4_t c2) {
-    const float32x4_t one = vdupq_n_f32(1.0f);
-    //float32x4_t arg = vaddq_f32(one, vmulq_f32(vmulq_f32(x, x), c1));
-    float32x4_t arg = vfmaq_f32(one, c1, vmulq_f32(x, x));
-    arg = vmulq_f32(arg, vmulq_f32(x, c2));
-    float32x4_t exp_arg = ggml_v_expf(arg);
-    float32x4_t gelu = vmulq_f32(x, vdivq_f32(exp_arg, vaddq_f32(exp_arg, one)));
-    uint32x4_t mask = vcgtq_f32(x, vdupq_n_f32(10.f));
-    return vbslq_f32(mask, x, gelu);
-    //return vreinterpretq_f32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_f32(x), mask), vbicq_u32(vreinterpretq_u32_f32(gelu), mask)));
+    return vdivq_f32(x, one_plus_exp_neg_x);
 }

 inline static float32x4_t ggml_v_tanh(float32x4_t x) {
@@ -2082,28 +2055,7 @@ inline static __m512 ggml_v_silu(__m512 x) {
 inline static __m512 ggml_v_tanh(__m512 x) {
     const __m512 one = _mm512_set1_ps(1.0f);
     const __m512 exp_two_x = ggml_v_expf(_mm512_mul_ps(x, _mm512_set1_ps(2.f)));
-    // return _mm512_div_ps(_mm512_sub_ps(exp_two_x, one), _mm512_add_ps(exp_two_x, one));
-    const __mmask16 mask = _mm512_cmp_ps_mask(x, _mm512_set1_ps(10.f), _CMP_GT_OQ);
-    const __m512 res = _mm512_div_ps(_mm512_sub_ps(exp_two_x, one), _mm512_add_ps(exp_two_x, one));
-    return _mm512_mask_blend_ps(mask, res, one);
-}
-
-inline static __m512 ggml_v_softcap(__m512 x, __m512 s_before, __m512 s_after) {
-    const __m512 one = _mm512_set1_ps(1.0f);
-    const __m512 exp_two_x = ggml_v_expf(_mm512_mul_ps(x, s_before));
-    const __m512 th = _mm512_div_ps(_mm512_sub_ps(exp_two_x, one), _mm512_add_ps(exp_two_x, one));
-    return _mm512_mul_ps(th, s_after);
-}
-
-inline static __m512 ggml_v_gelu(__m512 x, __m512 c1, __m512 c2) {
-    const __m512 one = _mm512_set1_ps(1.0f);
-    __m512 arg = _mm512_fmadd_ps(x, _mm512_mul_ps(c1, x), one);
-    //__m512 arg = _mm512_add_ps(one, _mm512_mul_ps(_mm512_mul_ps(x, x), c1));
-    arg = _mm512_mul_ps(arg, _mm512_mul_ps(c2, x));
-    const __mmask16 mask = _mm512_cmp_ps_mask(arg, _mm512_set1_ps(30.f), _CMP_GT_OQ);
-    const __m512 exp_arg = ggml_v_expf(arg);
-    const __m512 ratio = _mm512_div_ps(exp_arg, _mm512_add_ps(exp_arg, one));
-    return _mm512_mul_ps(x, _mm512_mask_blend_ps(mask, ratio, one));
+    return _mm512_div_ps(_mm512_sub_ps(exp_two_x, one), _mm512_add_ps(exp_two_x, one));
 }

 #elif defined(__AVX2__) && defined(__FMA__)
@@ -2164,28 +2116,7 @@ inline static __m256 ggml_v_silu(__m256 x) {
 inline static __m256 ggml_v_tanh(__m256 x) {
     const __m256 one = _mm256_set1_ps(1.0f);
     const __m256 exp_two_x = ggml_v_expf(_mm256_mul_ps(x, _mm256_set1_ps(2.f)));
-    // return _mm256_div_ps(_mm256_sub_ps(exp_two_x, one), _mm256_add_ps(exp_two_x, one));
-    const __m256 res = _mm256_div_ps(_mm256_sub_ps(exp_two_x, one), _mm256_add_ps(exp_two_x, one));
-    const __m256 mask = _mm256_cmp_ps(x, _mm256_set1_ps(10.f), _CMP_GT_OQ);
-    return _mm256_or_ps(_mm256_and_ps(mask, one), _mm256_andnot_ps(mask, res));
-}
-
-inline static __m256 ggml_v_softcap(__m256 x, float s_before, float s_after) {
-    return _mm256_mul_ps(_mm256_set1_ps(s_after), ggml_v_tanh(_mm256_mul_ps(x, _mm256_set1_ps(s_before))));
-    //const __m256 one = _mm256_set1_ps(1.0f);
-    //const __m256 exp_two_x = ggml_v_expf(_mm256_mul_ps(x, _mm256_set1_ps(2.f*s_before)));
-    //const __m256 th = _mm256_div_ps(_mm256_sub_ps(exp_two_x, one), _mm256_add_ps(exp_two_x, one));
-    //return _mm256_mul_ps(th, _mm256_set1_ps(s_after));
-}
-
-inline static __m256 ggml_v_gelu(__m256 x, __m256 c1, __m256 c2) {
-    const __m256 one = _mm256_set1_ps(1.0f);
-    const __m256 mask = _mm256_cmp_ps(x, _mm256_set1_ps(10.f), _CMP_GT_OQ);
-    __m256 arg = _mm256_add_ps(one, _mm256_mul_ps(_mm256_mul_ps(x, x), c1));
-    arg = _mm256_mul_ps(arg, _mm256_mul_ps(x, c2));
-    __m256 exp_arg = ggml_v_expf(arg);
-    __m256 gelu = _mm256_mul_ps(x, _mm256_div_ps(exp_arg, _mm256_add_ps(exp_arg, one)));
-    return _mm256_or_ps(_mm256_and_ps(mask, x), _mm256_andnot_ps(mask, gelu));
+    return _mm256_div_ps(_mm256_sub_ps(exp_two_x, one), _mm256_add_ps(exp_two_x, one));
 }

 #elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
@@ -2248,13 +2179,6 @@ inline static __m128 ggml_v_tanh(__m128 x) {
     return _mm_div_ps(_mm_sub_ps(exp_two_x, one), _mm_add_ps(exp_two_x, one));
 }

-inline static __m128 ggml_v_softcap(__m128 x, float s_before, float s_after) {
-    const __m128 one = _mm_set1_ps(1.0f);
-    const __m128 exp_two_x = ggml_v_expf(_mm_mul_ps(x, _mm_set1_ps(2.f*s_before)));
-    const __m128 th = _mm_div_ps(_mm_sub_ps(exp_two_x, one), _mm_add_ps(exp_two_x, one));
-    return _mm_mul_ps(th, _mm_set1_ps(s_after));
-}
-
 #endif // __ARM_NEON / __AVX2__ / __SSE2__

 static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
@@ -2314,108 +2238,6 @@ static void ggml_vec_tanh_f32(const int n, float * y, const float * x) {
     }
 }

-static void ggml_vec_softcap_f32(const int n, float * x, float s_before, float s_after) {
-    int i = 0;
-#if defined(__AVX512F__) && defined(__AVX512DQ__)
-    __m512 vs_before = _mm512_set1_ps(2.f*s_before);
-    __m512 vs_after = _mm512_set1_ps(s_after);
-    //for (; i + 63 < n; i += 64) {
-    //    __m512 x1 = _mm512_loadu_ps(x + i);
-    //    __m512 x2 = _mm512_loadu_ps(x + i + 16);
-    //    __m512 x3 = _mm512_loadu_ps(x + i + 32);
-    //    __m512 x4 = _mm512_loadu_ps(x + i + 48);
-    //    _mm512_storeu_ps(x + i + 0, ggml_v_softcap(x1, vs_before, vs_after));
-    //    _mm512_storeu_ps(x + i + 16, ggml_v_softcap(x2, vs_before, vs_after));
-    //    _mm512_storeu_ps(x + i + 32, ggml_v_softcap(x3, vs_before, vs_after));
-    //    _mm512_storeu_ps(x + i + 48, ggml_v_softcap(x4, vs_before, vs_after));
-    //}
-    for (; i + 15 < n; i += 16) {
-        _mm512_storeu_ps(x + i, ggml_v_softcap(_mm512_loadu_ps(x + i), vs_before, vs_after));
-    }
-#elif defined(__AVX2__) && defined(__FMA__)
-    for (; i + 7 < n; i += 8) {
-        _mm256_storeu_ps(x + i, ggml_v_softcap(_mm256_loadu_ps(x + i), s_before, s_after));
-    }
-#elif defined(__SSE2__)
-    for (; i + 3 < n; i += 4) {
-        _mm_storeu_ps(x + i, ggml_v_softcap(_mm_loadu_ps(x + i), s_before, s_after));
-    }
-#elif defined(__ARM_NEON) && defined(__aarch64__)
-    float32x4_t vs_before = vdupq_n_f32(s_before);
-    float32x4_t vs_after = vdupq_n_f32(s_after);
-    for (; i + 3 < n; i += 4) {
-        vst1q_f32(x + i, ggml_v_softcap(vld1q_f32(x + i), vs_before, vs_after));
-    }
-#endif
-    for (; i < n; ++i) {
-        x[i] = s_after*tanhf(x[i]*s_before);
-    }
-}
-
-inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    const uint16_t * i16 = (const uint16_t *) x;
-    for (int i = 0; i < n; ++i) {
-        y[i] = ggml_table_gelu_f16[i16[i]];
-    }
-}
-
-//
-// On my AVX512 (Ryzen-7950X) and AVX2 (Ryzen-5975WX) computing gelu directly
-// via SIMD instructions is faster than the fp16-based lookup table.
-// On my M2-Max CPU the lookup table is slightly faster than the SIMD version,
-// hence we use the SIMD version only if GGML_GELU_FP16 is not defined.
-// We do not run into numerical issues for large or small arguments because
-//     0.5f * (1 + tanhf(arg))
-// is computed as
-//     exp(2*arg)/(exp(2*arg) + 1)
-// The ggml_v_expf functions flushes to zero for large enough negative
-// arguments, so the above becomes zero. ggml_v_expf returns INFINITY
-// for large positive arguments, so we would get a NaN if we did nothing. But in the
-// ggml_v_gelu SIMD implementations we override the gelu result with the
-// input argument when the argument is greater than 10, so it is all good.
-//
-inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
-    int i = 0;
-#if defined(__AVX512F__) && defined(__AVX512DQ__)
-    __m512 c1 = _mm512_set1_ps(GELU_COEF_A);
-    __m512 c2 = _mm512_set1_ps(2.f*SQRT_2_OVER_PI);
-    for (; i + 15 < n; i += 16) {
-        _mm512_storeu_ps(y + i, ggml_v_gelu(_mm512_loadu_ps(x + i), c1, c2));
-    }
-#elif defined __AVX2__ && defined __FMA__
-    __m256 c1 = _mm256_set1_ps(GELU_COEF_A);
-    __m256 c2 = _mm256_set1_ps(2.f*SQRT_2_OVER_PI);
-    for (; i + 7 < n; i += 8) {
-        _mm256_storeu_ps(y + i, ggml_v_gelu(_mm256_loadu_ps(x + i), c1, c2));
-    }
-#endif
-#ifdef GGML_GELU_FP16
-    uint16_t t;
-    for (; i < n; ++i) {
-        if (x[i] <= -10.0f) {
-            y[i] = 0.0f;
-        } else if (x[i] >= 10.0f) {
-            y[i] = x[i];
-        } else {
-            ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
-            memcpy(&t, &fp16, sizeof(uint16_t));
-            y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
-        }
-    }
-#else
-#if defined __ARM_NEON
-    float32x4_t c1 = vdupq_n_f32(GELU_COEF_A);
-    float32x4_t c2 = vdupq_n_f32(2.f*SQRT_2_OVER_PI);
-    for (; i + 3 < n; i += 4) {
-        vst1q_f32(y + i, ggml_v_gelu(vld1q_f32(x + i), c1, c2));
-    }
-#endif
-    for (; i < n; ++i) {
-        y[i] = ggml_gelu_f32(x[i]);
-    }
-#endif
-}
-
 static ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
     int i = 0;
     ggml_float sum = 0;
@@ -8598,71 +8420,6 @@ static void ggml_compute_forward_scale(
     }
 }

-// ggml_compute_forward_softcap
-
-static void ggml_compute_forward_softcap_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    // scale factor
-    float val[2];
-    memcpy(val, dst->op_params, sizeof(val));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    const size_t nb01 = src0->nb[1];
-
-    const size_t nb1 = dst->nb[1];
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        if (dst->data != src0->data) {
-            // src0 is same shape as dst => same indices
-            memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float));
-        }
-        // TODO: better implementation
-        float * row = (float *) ((char *) dst->data + i1*nb1);
-        ggml_vec_softcap_f32(nc, row, val[0], val[1]);
-        //ggml_vec_scale_f32(nc, row, val[0]);
-        //ggml_vec_tanh_f32(nc, row, row);
-        //ggml_vec_scale_f32(nc, row, val[1]);
-    }
-}
-
-static void ggml_compute_forward_softcap(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_softcap_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_set

 static void ggml_compute_forward_set_f32(
@@ -12961,10 +12718,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_scale(params, tensor);
             } break;
-        case GGML_OP_SOFTCAP:
-            {
-                ggml_compute_forward_softcap(params, tensor);
-            } break;
         case GGML_OP_SET:
             {
                 ggml_compute_forward_set(params, tensor);
@@ -13372,7 +13125,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 n_tasks = 1;
             } break;
         case GGML_OP_SCALE:
-        case GGML_OP_SOFTCAP:
        case GGML_OP_SET:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
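The comment block removed above explains why the SIMD GeLU path stays numerically stable: 0.5f*(1 + tanhf(arg)) is evaluated as exp(2*arg)/(exp(2*arg) + 1), and the vector implementations substitute the input itself once it exceeds 10. A small hedged C check of that identity against the scalar ggml_gelu_f32 form (the two constants are copied from the diff; the test harness itself is illustrative):

#include <math.h>
#include <stdio.h>

// Constants as they appear in the scalar GeLU in ggml-cpu.c.
static const float GELU_COEF_A    = 0.044715f;
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;

// Scalar reference: 0.5*x*(1 + tanh(arg)) with arg = sqrt(2/pi)*x*(1 + a*x^2).
static float gelu_ref(float x) {
    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}

// Same value written the way the removed SIMD code computes it:
// 0.5*(1 + tanh(arg)) == exp(2*arg) / (exp(2*arg) + 1).
static float gelu_exp_form(float x) {
    const float arg = SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x);
    const float e   = expf(2.0f*arg);
    return x * (e / (e + 1.0f));
}

int main(void) {
    for (float x = -6.0f; x <= 6.0f; x += 1.5f) {
        printf("x=%5.2f  ref=%10.6f  exp-form=%10.6f\n", x, gelu_ref(x), gelu_exp_form(x));
    }
    return 0;
}

For very large positive arguments expf(2*arg) overflows to INFINITY and the ratio becomes NaN, which is the case the removed vector code avoids by blending in x past the cutoff; for large negative arguments the exponential flushes to zero and the result is already 0.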

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 0 additions & 5 deletions

@@ -40,7 +40,6 @@ bool g_mul_mat_q = false;
 #include "ggml-cuda/unary.cuh"
 #include "ggml-cuda/upscale.cuh"
 #include "ggml-cuda/wkv6.cuh"
-#include "ggml-cuda/softcap.cuh"

 #include <algorithm>
 #include <array>
@@ -2313,9 +2312,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_SCALE:
            ggml_cuda_op_scale(ctx, dst);
            break;
-        case GGML_OP_SOFTCAP:
-            ggml_cuda_op_softcap(ctx, dst);
-            break;
        case GGML_OP_SQR:
            ggml_cuda_op_sqr(ctx, dst);
            break;
@@ -3208,7 +3204,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_SCALE:
-        case GGML_OP_SOFTCAP:
        case GGML_OP_SQR:
        case GGML_OP_SQRT:
        case GGML_OP_SIN:
