@@ -1613,14 +1613,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
 #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
 #define GGML_F32x4_REDUCE(res, x)              \
 {                                              \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
-        x[2*i] = vaddq_f32(x[2*i], x[2*i+1]);  \
+    int offset = GGML_F32_ARR >> 1;            \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vaddq_f32(x[i], x[offset+i]);   \
     }                                          \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
-        x[4*i] = vaddq_f32(x[4*i], x[4*i+2]);  \
+    offset >>= 1;                              \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vaddq_f32(x[i], x[offset+i]);   \
     }                                          \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
-        x[8*i] = vaddq_f32(x[8*i], x[8*i+4]);  \
+    offset >>= 1;                              \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vaddq_f32(x[i], x[offset+i]);   \
     }                                          \
     res = GGML_F32x4_REDUCE_ONE(x[0]);         \
 }
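
This hunk (and the six parallel ones below) replaces three strided passes (x[2*i] += x[2*i+1], then x[4*i] += x[4*i+2], then x[8*i] += x[8*i+4]) with a halving-offset tree reduction: each pass folds the upper half of the live accumulator registers onto the lower half, so every pass touches a contiguous, unit-stride prefix of x and the total lands in x[0]. Both forms assume the accumulator count is a power of two; when it is 4, the third unrolled pass simply runs zero iterations. A minimal scalar sketch of the new pattern (names here are illustrative, not ggml code):

#include <stdio.h>

#define ARR 4  /* stands in for GGML_F32_ARR; must be a power of two */

int main(void) {
    float acc[ARR] = {1.0f, 2.0f, 3.0f, 4.0f};
    /* fold the top half onto the bottom half, halve the offset, repeat */
    for (int offset = ARR >> 1; offset > 0; offset >>= 1) {
        for (int i = 0; i < offset; ++i) {
            acc[i] += acc[offset + i];  /* contiguous, unit-stride access */
        }
    }
    printf("%f\n", acc[0]);  /* prints 10.000000 */
    return 0;
}
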
@@ -1651,14 +1654,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
 #define GGML_F16x8_MUL vmulq_f16
 #define GGML_F16x8_REDUCE(res, x)                                 \
 {                                                                 \
-    for (int i = 0; i < GGML_F16_ARR/2; ++i) {                    \
-        x[2*i] = vaddq_f16(x[2*i], x[2*i+1]);                     \
+    int offset = GGML_F16_ARR >> 1;                               \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = vaddq_f16(x[i], x[offset+i]);                      \
     }                                                             \
-    for (int i = 0; i < GGML_F16_ARR/4; ++i) {                    \
-        x[4*i] = vaddq_f16(x[4*i], x[4*i+2]);                     \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = vaddq_f16(x[i], x[offset+i]);                      \
     }                                                             \
-    for (int i = 0; i < GGML_F16_ARR/8; ++i) {                    \
-        x[8*i] = vaddq_f16(x[8*i], x[8*i+4]);                     \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = vaddq_f16(x[i], x[offset+i]);                      \
     }                                                             \
     const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0]));     \
     const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0]));     \
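
The f16 variant runs the same folding passes in half precision and widens only at the very end: once the total sits in x[0], its two halves are converted to f32 and summed. A standalone sketch of that final step, assuming an AArch64 compiler with FP16 vector support (e.g. -march=armv8.2-a+fp16; the helper name is illustrative, not ggml's):

#include <arm_neon.h>

static inline float f16x8_hsum(float16x8_t v) {
    const float32x4_t lo = vcvt_f32_f16(vget_low_f16 (v));  /* lanes 0-3 widened to f32 */
    const float32x4_t hi = vcvt_f32_f16(vget_high_f16(v));  /* lanes 4-7 widened to f32 */
    return vaddvq_f32(vaddq_f32(lo, hi));                   /* across-lane add of the sum */
}
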
@@ -1725,14 +1731,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
 #define GGML_F32x8_MUL _mm256_mul_ps
 #define GGML_F32x8_REDUCE(res, x)                                 \
 {                                                                 \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) {                    \
-        x[2*i] = _mm256_add_ps(x[2*i], x[2*i+1]);                 \
+    int offset = GGML_F32_ARR >> 1;                               \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
     }                                                             \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) {                    \
-        x[4*i] = _mm256_add_ps(x[4*i], x[4*i+2]);                 \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
     }                                                             \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) {                    \
-        x[8*i] = _mm256_add_ps(x[8*i], x[8*i+4]);                 \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
     }                                                             \
     const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]),    \
                                  _mm256_extractf128_ps(x[0], 1)); \
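
On AVX the accumulators are 8 lanes wide, so the tail first folds the high 128-bit half of x[0] onto the low half (the t0 line above), then finishes with horizontal adds in the lines that follow this hunk. A standalone sketch of the whole horizontal sum under those assumptions (the helper name is illustrative, not ggml's):

#include <immintrin.h>

static inline float f32x8_hsum(__m256 v) {
    __m128 t = _mm_add_ps(_mm256_castps256_ps128(v),    /* low 128 bits  */
                          _mm256_extractf128_ps(v, 1)); /* high 128 bits */
    t = _mm_hadd_ps(t, t);  /* (t0+t1, t2+t3, t0+t1, t2+t3) */
    t = _mm_hadd_ps(t, t);  /* lane 0 now holds the full sum */
    return _mm_cvtss_f32(t);
}
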
@@ -1822,14 +1831,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F32x4_MUL vec_mul
 #define GGML_F32x4_REDUCE(res, x)              \
 {                                              \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
-        x[2*i] = vec_add(x[2*i], x[2*i+1]);    \
+    int offset = GGML_F32_ARR >> 1;            \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vec_add(x[i], x[offset+i]);     \
     }                                          \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
-        x[4*i] = vec_add(x[4*i], x[4*i+2]);    \
+    offset >>= 1;                              \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vec_add(x[i], x[offset+i]);     \
     }                                          \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
-        x[8*i] = vec_add(x[8*i], x[8*i+4]);    \
+    offset >>= 1;                              \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vec_add(x[i], x[offset+i]);     \
     }                                          \
     res = vec_extract(x[0], 0) +               \
           vec_extract(x[0], 1) +               \
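
The POWER9 tail is a plain lane-by-lane extraction; the hunk is cut off after lane 1, and the full source continues with lanes 2 and 3. As a standalone helper, assuming a compiler with <altivec.h> (the helper name is illustrative):

#include <altivec.h>

static inline float f32x4_hsum(vector float v) {
    /* extract all four f32 lanes and add them in scalar code */
    return vec_extract(v, 0) + vec_extract(v, 1) +
           vec_extract(v, 2) + vec_extract(v, 3);
}
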
@@ -1885,14 +1897,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F32x4_MUL wasm_f32x4_mul
 #define GGML_F32x4_REDUCE(res, x)                  \
 {                                                  \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) {     \
-        x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
+    int offset = GGML_F32_ARR >> 1;                \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
     }                                              \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) {     \
-        x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
     }                                              \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) {     \
-        x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
     }                                              \
     res = wasm_f32x4_extract_lane(x[0], 0) +       \
           wasm_f32x4_extract_lane(x[0], 1) +       \
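
The WASM SIMD tail is the same idea with wasm_f32x4_extract_lane (lanes 2 and 3 follow in the full source). As a standalone helper, assuming a wasm target built with -msimd128 (the helper name is illustrative):

#include <wasm_simd128.h>

static inline float f32x4_hsum(v128_t v) {
    /* lane indices must be compile-time constants */
    return wasm_f32x4_extract_lane(v, 0) + wasm_f32x4_extract_lane(v, 1) +
           wasm_f32x4_extract_lane(v, 2) + wasm_f32x4_extract_lane(v, 3);
}
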
@@ -1947,14 +1962,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F16x4_MUL wasm_f32x4_mul
 #define GGML_F16x4_REDUCE(res, x)                  \
 {                                                  \
-    for (int i = 0; i < GGML_F16_ARR/2; ++i) {     \
-        x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
+    int offset = GGML_F16_ARR >> 1;                \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
     }                                              \
-    for (int i = 0; i < GGML_F16_ARR/4; ++i) {     \
-        x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
     }                                              \
-    for (int i = 0; i < GGML_F16_ARR/8; ++i) {     \
-        x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
     }                                              \
     res = wasm_f32x4_extract_lane(x[0], 0) +       \
           wasm_f32x4_extract_lane(x[0], 1) +       \
@@ -1996,14 +2014,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F32x4_MUL _mm_mul_ps
 #define GGML_F32x4_REDUCE(res, x)              \
 {                                              \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
-        x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]); \
+    int offset = GGML_F32_ARR >> 1;            \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = _mm_add_ps(x[i], x[offset+i]);  \
     }                                          \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
-        x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]); \
+    offset >>= 1;                              \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = _mm_add_ps(x[i], x[offset+i]);  \
     }                                          \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
-        x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]); \
+    offset >>= 1;                              \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = _mm_add_ps(x[i], x[offset+i]);  \
     }                                          \
     const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
     res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0));  \
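
Putting the SSE pieces together, a runnable end-to-end sketch of the pattern (illustrative, not ggml code; needs SSE3 for _mm_hadd_ps, e.g. gcc -msse3): four accumulators are folded by the halving-offset passes, then collapsed with two horizontal adds exactly as in the tail above.

#include <pmmintrin.h>
#include <stdio.h>

int main(void) {
    float data[16];
    for (int i = 0; i < 16; ++i) data[i] = (float)(i + 1);  /* sum = 136 */

    __m128 x[4];  /* four accumulators, as with GGML_F32_ARR == 4 */
    for (int i = 0; i < 4; ++i) x[i] = _mm_loadu_ps(data + 4*i);

    /* halving-offset tree reduction across the accumulator registers */
    for (int offset = 4 >> 1; offset > 0; offset >>= 1) {
        for (int i = 0; i < offset; ++i) {
            x[i] = _mm_add_ps(x[i], x[offset + i]);
        }
    }

    /* collapse the surviving register to a scalar with horizontal adds */
    const __m128 t0  = _mm_hadd_ps(x[0], x[0]);
    const float  res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0));
    printf("%f\n", res);  /* prints 136.000000 */
    return 0;
}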