@@ -1887,15 +1887,15 @@ inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float *
 inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
 
-static const float GELU_QUICK_COEF = -1.702f;
 static const float GELU_COEF_A     = 0.044715f;
+static const float GELU_QUICK_COEF = -1.702f;
 static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
 
 inline static float ggml_gelu_f32(float x) {
     return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
 }
 
-/* inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     const uint16_t * i16 = (const uint16_t *) x;
     for (int i = 0; i < n; ++i) {
         y[i] = ggml_table_gelu_f16[i16[i]];
@@ -1923,7 +1923,7 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
         y[i] = ggml_gelu_f32(x[i]);
     }
 }
-#endif */
+#endif
 
 inline static float ggml_gelu_quick_f32(float x) {
     return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
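
For reference, the two GELU variants that the constants touched in this hunk feed are the tanh approximation in ggml_gelu_f32 and the cheaper sigmoid form in ggml_gelu_quick_f32. The sketch below restates both in plain scalar C; the ref_* function names are illustrative only and not part of ggml.

#include <math.h>

// Reference-only sketch (not part of the patch): the two scalar GELU variants.
static float ref_gelu_tanh(float x) {
    const float GELU_COEF_A    = 0.044715f;
    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
    // 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3)))
    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}

static float ref_gelu_quick(float x) {
    const float GELU_QUICK_COEF = -1.702f;
    return x/(1.0f + expf(GELU_QUICK_COEF*x)); // x * sigmoid(1.702*x)
}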
@@ -1999,34 +1999,7 @@ inline static float32x4_t ggml_v_silu(float32x4_t x) {
     const float32x4_t neg_x = vsubq_f32(zero, x);
     const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
     const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
-    // return vdivq_f32(x, one_plus_exp_neg_x);
-    const uint32x4_t mask = vcgtq_f32(x, vdupq_n_f32(10.f));
-    const float32x4_t res = vdivq_f32(vsubq_f32(exp_two_x, one), vaddq_f32(exp_two_x, one));
-    return vreinterpretq_f32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_f32(one), mask), vbicq_u32(vreinterpretq_u32_f32(res), mask)));
-    //return vdivq_f32(vsubq_f32(exp_two_x, one), vaddq_f32(exp_two_x, one));
-}
-
-inline static float32x4_t ggml_v_softcap(float32x4_t x, float32x4_t s_before, float32x4_t s_after) {
-    return vmulq_f32(s_after, ggml_v_tanh(vmulq_f32(x, s_before)));
-    //const float32x4_t one = vdupq_n_f32(1.0f);
-    //const float32x4_t two_x = vmulq_f32(x, s_before);
-    //const float32x4_t exp_two_x = ggml_v_expf(two_x);
-    //const float32x4_t th = vdivq_f32(vsubq_f32(exp_two_x, one), vaddq_f32(exp_two_x, one));
-    //return vmulq_f32(th, s_after);
-}
-
-
-// Slower than lookup on my M2-Max
-inline static float32x4_t ggml_v_gelu(float32x4_t x, float32x4_t c1, float32x4_t c2) {
-    const float32x4_t one = vdupq_n_f32(1.0f);
-    //float32x4_t arg = vaddq_f32(one, vmulq_f32(vmulq_f32(x, x), c1));
-    float32x4_t arg = vfmaq_f32(one, c1, vmulq_f32(x, x));
-    arg = vmulq_f32(arg, vmulq_f32(x, c2));
-    float32x4_t exp_arg = ggml_v_expf(arg);
-    float32x4_t gelu = vmulq_f32(x, vdivq_f32(exp_arg, vaddq_f32(exp_arg, one)));
-    uint32x4_t mask = vcgtq_f32(x, vdupq_n_f32(10.f));
-    return vbslq_f32(mask, x, gelu);
-    //return vreinterpretq_f32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_f32(x), mask), vbicq_u32(vreinterpretq_u32_f32(gelu), mask)));
+    return vdivq_f32(x, one_plus_exp_neg_x);
 }
 
 inline static float32x4_t ggml_v_tanh(float32x4_t x) {
@@ -2082,28 +2055,7 @@ inline static __m512 ggml_v_silu(__m512 x) {
 inline static __m512 ggml_v_tanh(__m512 x) {
     const __m512 one = _mm512_set1_ps(1.0f);
     const __m512 exp_two_x = ggml_v_expf(_mm512_mul_ps(x, _mm512_set1_ps(2.f)));
-    // return _mm512_div_ps(_mm512_sub_ps(exp_two_x, one), _mm512_add_ps(exp_two_x, one));
-    const __mmask16 mask = _mm512_cmp_ps_mask(x, _mm512_set1_ps(10.f), _CMP_GT_OQ);
-    const __m512 res = _mm512_div_ps(_mm512_sub_ps(exp_two_x, one), _mm512_add_ps(exp_two_x, one));
-    return _mm512_mask_blend_ps(mask, res, one);
-}
-
-inline static __m512 ggml_v_softcap(__m512 x, __m512 s_before, __m512 s_after) {
-    const __m512 one = _mm512_set1_ps(1.0f);
-    const __m512 exp_two_x = ggml_v_expf(_mm512_mul_ps(x, s_before));
-    const __m512 th = _mm512_div_ps(_mm512_sub_ps(exp_two_x, one), _mm512_add_ps(exp_two_x, one));
-    return _mm512_mul_ps(th, s_after);
-}
-
-inline static __m512 ggml_v_gelu(__m512 x, __m512 c1, __m512 c2) {
-    const __m512 one = _mm512_set1_ps(1.0f);
-    __m512 arg = _mm512_fmadd_ps(x, _mm512_mul_ps(c1, x), one);
-    //__m512 arg = _mm512_add_ps(one, _mm512_mul_ps(_mm512_mul_ps(x, x), c1));
-    arg = _mm512_mul_ps(arg, _mm512_mul_ps(c2, x));
-    const __mmask16 mask = _mm512_cmp_ps_mask(arg, _mm512_set1_ps(30.f), _CMP_GT_OQ);
-    const __m512 exp_arg = ggml_v_expf(arg);
-    const __m512 ratio = _mm512_div_ps(exp_arg, _mm512_add_ps(exp_arg, one));
-    return _mm512_mul_ps(x, _mm512_mask_blend_ps(mask, ratio, one));
+    return _mm512_div_ps(_mm512_sub_ps(exp_two_x, one), _mm512_add_ps(exp_two_x, one));
 }
 
 #elif defined(__AVX2__) && defined(__FMA__)
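
All of the ggml_v_tanh variants in these hunks rely on the identity tanh(x) = (exp(2x) - 1)/(exp(2x) + 1). The removed branches additionally blended in 1.0f when x > 10, because ggml_v_expf returns INFINITY for large arguments and the ratio would then be Inf/Inf = NaN; the reverted code keeps only the plain ratio. A hedged scalar restatement, with an illustrative function name not present in ggml:

#include <math.h>

// Reference-only scalar model of the identity used by the SIMD ggml_v_tanh.
static float ref_tanh_via_exp(float x) {
    const float e2x = expf(2.0f*x);          // may overflow to INFINITY for large x
    return (e2x - 1.0f)/(e2x + 1.0f);        // the removed code clamped to 1.0f for x > 10
}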
@@ -2164,28 +2116,7 @@ inline static __m256 ggml_v_silu(__m256 x) {
 inline static __m256 ggml_v_tanh(__m256 x) {
     const __m256 one = _mm256_set1_ps(1.0f);
     const __m256 exp_two_x = ggml_v_expf(_mm256_mul_ps(x, _mm256_set1_ps(2.f)));
-    // return _mm256_div_ps(_mm256_sub_ps(exp_two_x, one), _mm256_add_ps(exp_two_x, one));
-    const __m256 res = _mm256_div_ps(_mm256_sub_ps(exp_two_x, one), _mm256_add_ps(exp_two_x, one));
-    const __m256 mask = _mm256_cmp_ps(x, _mm256_set1_ps(10.f), _CMP_GT_OQ);
-    return _mm256_or_ps(_mm256_and_ps(mask, one), _mm256_andnot_ps(mask, res));
-}
-
-inline static __m256 ggml_v_softcap(__m256 x, float s_before, float s_after) {
-    return _mm256_mul_ps(_mm256_set1_ps(s_after), ggml_v_tanh(_mm256_mul_ps(x, _mm256_set1_ps(s_before))));
-    //const __m256 one = _mm256_set1_ps(1.0f);
-    //const __m256 exp_two_x = ggml_v_expf(_mm256_mul_ps(x, _mm256_set1_ps(2.f*s_before)));
-    //const __m256 th = _mm256_div_ps(_mm256_sub_ps(exp_two_x, one), _mm256_add_ps(exp_two_x, one));
-    //return _mm256_mul_ps(th, _mm256_set1_ps(s_after));
-}
-
-inline static __m256 ggml_v_gelu(__m256 x, __m256 c1, __m256 c2) {
-    const __m256 one = _mm256_set1_ps(1.0f);
-    const __m256 mask = _mm256_cmp_ps(x, _mm256_set1_ps(10.f), _CMP_GT_OQ);
-    __m256 arg = _mm256_add_ps(one, _mm256_mul_ps(_mm256_mul_ps(x, x), c1));
-    arg = _mm256_mul_ps(arg, _mm256_mul_ps(x, c2));
-    __m256 exp_arg = ggml_v_expf(arg);
-    __m256 gelu = _mm256_mul_ps(x, _mm256_div_ps(exp_arg, _mm256_add_ps(exp_arg, one)));
-    return _mm256_or_ps(_mm256_and_ps(mask, x), _mm256_andnot_ps(mask, gelu));
+    return _mm256_div_ps(_mm256_sub_ps(exp_two_x, one), _mm256_add_ps(exp_two_x, one));
 }
 
 #elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
@@ -2248,13 +2179,6 @@ inline static __m128 ggml_v_tanh(__m128 x) {
     return _mm_div_ps(_mm_sub_ps(exp_two_x, one), _mm_add_ps(exp_two_x, one));
 }
 
-inline static __m128 ggml_v_softcap(__m128 x, float s_before, float s_after) {
-    const __m128 one = _mm_set1_ps(1.0f);
-    const __m128 exp_two_x = ggml_v_expf(_mm_mul_ps(x, _mm_set1_ps(2.f*s_before)));
-    const __m128 th = _mm_div_ps(_mm_sub_ps(exp_two_x, one), _mm_add_ps(exp_two_x, one));
-    return _mm_mul_ps(th, _mm_set1_ps(s_after));
-}
-
 #endif // __ARM_NEON / __AVX2__ / __SSE2__
 
 static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
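
Every ggml_v_softcap variant removed above computes the same thing, matching the scalar tail of the removed ggml_vec_softcap_f32 in the next hunk: y = s_after * tanh(s_before * x). The AVX512 path pre-multiplied s_before by 2 so the (exp(2t) - 1)/(exp(2t) + 1) identity needed only a single exp. A reference-only scalar sketch (function name illustrative, not part of ggml):

#include <math.h>

// Reference-only: in-place softcap over n floats, y = s_after * tanh(s_before * x).
static void ref_softcap_f32(int n, float * x, float s_before, float s_after) {
    for (int i = 0; i < n; ++i) {
        x[i] = s_after*tanhf(x[i]*s_before);
    }
}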
@@ -2314,108 +2238,6 @@ static void ggml_vec_tanh_f32(const int n, float * y, const float * x) {
     }
 }
 
-static void ggml_vec_softcap_f32(const int n, float * x, float s_before, float s_after) {
-    int i = 0;
-#if defined(__AVX512F__) && defined(__AVX512DQ__)
-    __m512 vs_before = _mm512_set1_ps(2.f*s_before);
-    __m512 vs_after  = _mm512_set1_ps(s_after);
-    //for (; i + 63 < n; i += 64) {
-    //    __m512 x1 = _mm512_loadu_ps(x + i);
-    //    __m512 x2 = _mm512_loadu_ps(x + i + 16);
-    //    __m512 x3 = _mm512_loadu_ps(x + i + 32);
-    //    __m512 x4 = _mm512_loadu_ps(x + i + 48);
-    //    _mm512_storeu_ps(x + i +  0, ggml_v_softcap(x1, vs_before, vs_after));
-    //    _mm512_storeu_ps(x + i + 16, ggml_v_softcap(x2, vs_before, vs_after));
-    //    _mm512_storeu_ps(x + i + 32, ggml_v_softcap(x3, vs_before, vs_after));
-    //    _mm512_storeu_ps(x + i + 48, ggml_v_softcap(x4, vs_before, vs_after));
-    //}
-    for (; i + 15 < n; i += 16) {
-        _mm512_storeu_ps(x + i, ggml_v_softcap(_mm512_loadu_ps(x + i), vs_before, vs_after));
-    }
-#elif defined(__AVX2__) && defined(__FMA__)
-    for (; i + 7 < n; i += 8) {
-        _mm256_storeu_ps(x + i, ggml_v_softcap(_mm256_loadu_ps(x + i), s_before, s_after));
-    }
-#elif defined(__SSE2__)
-    for (; i + 3 < n; i += 4) {
-        _mm_storeu_ps(x + i, ggml_v_softcap(_mm_loadu_ps(x + i), s_before, s_after));
-    }
-#elif defined(__ARM_NEON) && defined(__aarch64__)
-    float32x4_t vs_before = vdupq_n_f32(s_before);
-    float32x4_t vs_after  = vdupq_n_f32(s_after);
-    for (; i + 3 < n; i += 4) {
-        vst1q_f32(x + i, ggml_v_softcap(vld1q_f32(x + i), vs_before, vs_after));
-    }
-#endif
-    for (; i < n; ++i) {
-        x[i] = s_after*tanhf(x[i]*s_before);
-    }
-}
-
-inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    const uint16_t * i16 = (const uint16_t *) x;
-    for (int i = 0; i < n; ++i) {
-        y[i] = ggml_table_gelu_f16[i16[i]];
-    }
-}
-
-//
-// On my AVX512 (Ryzen-7950X) and AVX2 (Ryzen-5975WX) computing gelu directly
-// via SIMD instructions is faster than the fp16-based lookup table.
-// On my M2-Max CPU the lookup table is slightly faster than the SIMD version,
-// hence we use the SIMD version only if GGML_GELU_FP16 is not defined.
-// We do not run into numerical issues for large or small arguments because
-//    0.5f * (1 + tanhf(arg))
-// is computed as
-//    exp(2*arg)/(exp(2*arg) + 1)
-// The ggml_v_expf function flushes to zero for large enough negative
-// arguments, so the above becomes zero. ggml_v_expf returns INFINITY
-// for large positive arguments, so we would get a NaN if we did nothing. But in the
-// ggml_v_gelu SIMD implementations we override the gelu result with the
-// input argument when the argument is greater than 10, so it is all good.
-//
-inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
-    int i = 0;
-#if defined(__AVX512F__) && defined(__AVX512DQ__)
-    __m512 c1 = _mm512_set1_ps(GELU_COEF_A);
-    __m512 c2 = _mm512_set1_ps(2.f*SQRT_2_OVER_PI);
-    for (; i + 15 < n; i += 16) {
-        _mm512_storeu_ps(y + i, ggml_v_gelu(_mm512_loadu_ps(x + i), c1, c2));
-    }
-#elif defined __AVX2__ && defined __FMA__
-    __m256 c1 = _mm256_set1_ps(GELU_COEF_A);
-    __m256 c2 = _mm256_set1_ps(2.f*SQRT_2_OVER_PI);
-    for (; i + 7 < n; i += 8) {
-        _mm256_storeu_ps(y + i, ggml_v_gelu(_mm256_loadu_ps(x + i), c1, c2));
-    }
-#endif
-#ifdef GGML_GELU_FP16
-    uint16_t t;
-    for (; i < n; ++i) {
-        if (x[i] <= -10.0f) {
-            y[i] = 0.0f;
-        } else if (x[i] >= 10.0f) {
-            y[i] = x[i];
-        } else {
-            ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
-            memcpy(&t, &fp16, sizeof(uint16_t));
-            y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
-        }
-    }
-#else
-#if defined __ARM_NEON
-    float32x4_t c1 = vdupq_n_f32(GELU_COEF_A);
-    float32x4_t c2 = vdupq_n_f32(2.f*SQRT_2_OVER_PI);
-    for (; i + 3 < n; i += 4) {
-        vst1q_f32(y + i, ggml_v_gelu(vld1q_f32(x + i), c1, c2));
-    }
-#endif
-    for (; i < n; ++i) {
-        y[i] = ggml_gelu_f32(x[i]);
-    }
-#endif
-}
-
 static ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
     int i = 0;
     ggml_float sum = 0;
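
The removed comment above explains the numerical trick behind the deleted SIMD ggml_v_gelu: 0.5f*(1 + tanhf(t)) is rewritten as exp(2t)/(exp(2t) + 1), which is why the vector constant was 2.f*SQRT_2_OVER_PI, and inputs greater than 10 are passed through unchanged because exp overflows to INFINITY there. A reference-only scalar model of that path, assuming the constants from earlier in the file (function name illustrative):

#include <math.h>

// Reference-only scalar model of the removed SIMD ggml_v_gelu formulation.
static float ref_gelu_exp_form(float x) {
    const float GELU_COEF_A    = 0.044715f;
    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
    if (x > 10.0f) {
        return x; // expf(arg) would be INFINITY; the deleted code returned x here
    }
    const float arg = 2.0f*SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x); // 2t
    const float e   = expf(arg);       // underflows to 0 for very negative x => result 0
    return x*e/(e + 1.0f);             // equals 0.5f*x*(1 + tanhf(arg/2))
}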
@@ -8598,71 +8420,6 @@ static void ggml_compute_forward_scale(
     }
 }
 
-// ggml_compute_forward_softcap
-
-static void ggml_compute_forward_softcap_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    // scale factor
-    float val[2];
-    memcpy(val, dst->op_params, sizeof(val));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    const size_t nb01 = src0->nb[1];
-
-    const size_t nb1 = dst->nb[1];
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        if (dst->data != src0->data) {
-            // src0 is same shape as dst => same indices
-            memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float));
-        }
-        // TODO: better implementation
-        float * row = (float *) ((char *) dst->data + i1*nb1);
-        ggml_vec_softcap_f32(nc, row, val[0], val[1]);
-        //ggml_vec_scale_f32(nc, row, val[0]);
-        //ggml_vec_tanh_f32(nc, row, row);
-        //ggml_vec_scale_f32(nc, row, val[1]);
-    }
-}
-
-static void ggml_compute_forward_softcap(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_softcap_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_set
 
 static void ggml_compute_forward_set_f32(
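
The removed forward op read its two scale factors from dst->op_params and split the rows of src0 across threads with the usual ggml pattern: dr = (nr + nth - 1)/nth rows per thread, thread ith handling [dr*ith, min(dr*ith + dr, nr)). A small standalone illustration of that partitioning, with example values that are assumptions and not taken from the patch:

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
    const int nr  = 10; // example row count (assumed)
    const int nth = 4;  // example thread count (assumed)
    const int dr  = (nr + nth - 1)/nth; // rows per thread, rounded up
    for (int ith = 0; ith < nth; ++ith) {
        const int ir0 = dr*ith;
        const int ir1 = MIN(ir0 + dr, nr);
        printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1); // last thread gets the remainder
    }
    return 0;
}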
@@ -12961,10 +12718,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_scale(params, tensor);
             } break;
-        case GGML_OP_SOFTCAP:
-            {
-                ggml_compute_forward_softcap(params, tensor);
-            } break;
         case GGML_OP_SET:
             {
                 ggml_compute_forward_set(params, tensor);
@@ -13372,7 +13125,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 n_tasks = 1;
             } break;
         case GGML_OP_SCALE:
-        case GGML_OP_SOFTCAP:
         case GGML_OP_SET:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW: