Skip to content

Commit a4b2441

Browse files
committed
corrected small-size vectorSlope
1 parent 160c50f commit a4b2441

10 files changed

+84
-78
lines changed

simd_utils_altivec_float.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -3544,7 +3544,7 @@ static inline void vectorSlope128f(float *dst, int len, float offset, float slop
35443544
int stop_len = len / (2 * ALTIVEC_LEN_FLOAT);
35453545
stop_len *= (2 * ALTIVEC_LEN_FLOAT);
35463546

3547-
if (len >= ALTIVEC_LEN_BYTES) {
3547+
if (len >= 2*ALTIVEC_LEN_FLOAT) {
35483548
if (isAligned((uintptr_t) (dst), ALTIVEC_LEN_BYTES)) {
35493549
vec_st(curVal, 0, dst + 0);
35503550
vec_st(curVal2, 0, dst + ALTIVEC_LEN_FLOAT);

simd_utils_avx512_double.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -421,7 +421,7 @@ static inline void vectorSlope512d(double *dst, int len, double offset, double s
421421
int stop_len = len / (2 * AVX512_LEN_DOUBLE);
422422
stop_len *= (2 * AVX512_LEN_DOUBLE);
423423

424-
if (len >= AVX512_LEN_DOUBLE) {
424+
if (len >= 2*AVX512_LEN_DOUBLE) {
425425
if (isAligned((uintptr_t) (dst), AVX512_LEN_BYTES)) {
426426
_mm512_store_pd(dst + 0, curVal);
427427
_mm512_store_pd(dst + AVX512_LEN_DOUBLE, curVal2);

simd_utils_avx512_float.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -767,7 +767,7 @@ static inline void vectorSlope512f(float *dst, int len, float offset, float slop
767767
int stop_len = len / (2 * AVX512_LEN_FLOAT);
768768
stop_len *= (2 * AVX512_LEN_FLOAT);
769769

770-
if (len >= AVX512_LEN_FLOAT) {
770+
if (len >= 2*AVX512_LEN_FLOAT) {
771771
if (isAligned((uintptr_t) (dst), AVX512_LEN_BYTES)) {
772772
_mm512_store_ps(dst + 0, curVal);
773773
_mm512_store_ps(dst + AVX512_LEN_FLOAT, curVal2);

simd_utils_avx512_int32.h

+26-24
Original file line numberDiff line numberDiff line change
@@ -124,30 +124,32 @@ static inline void vectorSlope512s(int *dst, int len, int offset, int slope)
124124
int stop_len = len / (2 * AVX512_LEN_INT32);
125125
stop_len *= (2 * AVX512_LEN_INT32);
126126

127-
if (isAligned((uintptr_t) (dst), AVX512_LEN_BYTES)) {
128-
_mm512_store_si512((__m512i *) (dst + 0), curVal);
129-
_mm512_store_si512((__m512i *) (dst + AVX512_LEN_INT32), curVal2);
130-
} else {
131-
_mm512_storeu_si512((__m512i *) (dst + 0), curVal);
132-
_mm512_storeu_si512((__m512i *) (dst + AVX512_LEN_INT32), curVal2);
133-
}
134-
135-
if (isAligned((uintptr_t) (dst), AVX512_LEN_BYTES)) {
136-
for (int i = 2 * AVX512_LEN_INT32; i < stop_len; i += 2 * AVX512_LEN_INT32) {
137-
curVal = _mm512_add_epi32(curVal, slope32_vec);
138-
_mm512_store_si512((__m512i *) (dst + i), curVal);
139-
curVal2 = _mm512_add_epi32(curVal2, slope32_vec);
140-
_mm512_store_si512((__m512i *) (dst + i + AVX512_LEN_INT32), curVal2);
141-
}
142-
} else {
143-
for (int i = 2 * AVX512_LEN_INT32; i < stop_len; i += 2 * AVX512_LEN_INT32) {
144-
curVal = _mm512_add_epi32(curVal, slope32_vec);
145-
_mm512_storeu_si512((__m512i *) (dst + i), curVal);
146-
curVal2 = _mm512_add_epi32(curVal2, slope32_vec);
147-
_mm512_storeu_si512((__m512i *) (dst + i + AVX512_LEN_INT32), curVal2);
148-
}
149-
}
150-
127+
if (len >= 2*AVX512_LEN_INT32) {
128+
if (isAligned((uintptr_t) (dst), AVX512_LEN_BYTES)) {
129+
_mm512_store_si512((__m512i *) (dst + 0), curVal);
130+
_mm512_store_si512((__m512i *) (dst + AVX512_LEN_INT32), curVal2);
131+
} else {
132+
_mm512_storeu_si512((__m512i *) (dst + 0), curVal);
133+
_mm512_storeu_si512((__m512i *) (dst + AVX512_LEN_INT32), curVal2);
134+
}
135+
136+
if (isAligned((uintptr_t) (dst), AVX512_LEN_BYTES)) {
137+
for (int i = 2 * AVX512_LEN_INT32; i < stop_len; i += 2 * AVX512_LEN_INT32) {
138+
curVal = _mm512_add_epi32(curVal, slope32_vec);
139+
_mm512_store_si512((__m512i *) (dst + i), curVal);
140+
curVal2 = _mm512_add_epi32(curVal2, slope32_vec);
141+
_mm512_store_si512((__m512i *) (dst + i + AVX512_LEN_INT32), curVal2);
142+
}
143+
} else {
144+
for (int i = 2 * AVX512_LEN_INT32; i < stop_len; i += 2 * AVX512_LEN_INT32) {
145+
curVal = _mm512_add_epi32(curVal, slope32_vec);
146+
_mm512_storeu_si512((__m512i *) (dst + i), curVal);
147+
curVal2 = _mm512_add_epi32(curVal2, slope32_vec);
148+
_mm512_storeu_si512((__m512i *) (dst + i + AVX512_LEN_INT32), curVal2);
149+
}
150+
}
151+
}
152+
151153
for (int i = stop_len; i < len; i++) {
152154
dst[i] = offset + slope * i;
153155
}

simd_utils_avx_double.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -422,7 +422,7 @@ static inline void vectorSlope256d(double *dst, int len, double offset, double s
422422
v4sd curVal2 = _mm256_add_pd(_mm256_set1_pd(offset), coef);
423423
curVal2 = _mm256_add_pd(curVal2, _mm256_set1_pd(4.0 * slope));
424424

425-
if (len >= AVX_LEN_DOUBLE) {
425+
if (len >= 2*AVX_LEN_DOUBLE) {
426426
if (isAligned((uintptr_t) (dst), AVX_LEN_BYTES)) {
427427
_mm256_store_pd(dst + 0, curVal);
428428
_mm256_store_pd(dst + AVX_LEN_DOUBLE, curVal2);

simd_utils_avx_float.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -865,7 +865,7 @@ static inline void vectorSlope256f(float *dst, int len, float offset, float slop
865865
int stop_len = len / (2 * AVX_LEN_FLOAT);
866866
stop_len *= (2 * AVX_LEN_FLOAT);
867867

868-
if (len >= AVX_LEN_FLOAT) {
868+
if (len >= 2*AVX_LEN_FLOAT) {
869869
if (((uintptr_t) (const void *) (dst) % AVX_LEN_BYTES) == 0) {
870870
_mm256_store_ps(dst + 0, curVal);
871871
_mm256_store_ps(dst + AVX_LEN_FLOAT, curVal2);

simd_utils_avx_int32.h

+25-23
Original file line numberDiff line numberDiff line change
@@ -121,29 +121,31 @@ static inline void vectorSlope256s(int *dst, int len, int offset, int slope)
121121
int stop_len = len / (2 * AVX_LEN_INT32);
122122
stop_len *= (2 * AVX_LEN_INT32);
123123

124-
if (((uintptr_t) (const void *) (dst) % AVX_LEN_BYTES) == 0) {
125-
_mm256_storeu_si256((__m256i *) (dst + 0), curVal);
126-
_mm256_storeu_si256((__m256i *) (dst + AVX_LEN_INT32), curVal2);
127-
} else {
128-
_mm256_storeu_si256((__m256i *) (dst + 0), curVal);
129-
_mm256_storeu_si256((__m256i *) (dst + AVX_LEN_INT32), curVal2);
130-
}
131-
132-
if (((uintptr_t) (const void *) (dst) % AVX_LEN_BYTES) == 0) {
133-
for (int i = 2 * AVX_LEN_INT32; i < stop_len; i += 2 * AVX_LEN_INT32) {
134-
curVal = _mm256_add_epi32(curVal, slope16_vec);
135-
_mm256_store_si256((__m256i *) (dst + i), curVal);
136-
curVal2 = _mm256_add_epi32(curVal2, slope16_vec);
137-
_mm256_store_si256((__m256i *) (dst + i + AVX_LEN_INT32), curVal2);
138-
}
139-
} else {
140-
for (int i = 2 * AVX_LEN_INT32; i < stop_len; i += 2 * AVX_LEN_INT32) {
141-
curVal = _mm256_add_epi32(curVal, slope16_vec);
142-
_mm256_storeu_si256((__m256i *) (dst + i), curVal);
143-
curVal2 = _mm256_add_epi32(curVal2, slope16_vec);
144-
_mm256_storeu_si256((__m256i *) (dst + i + AVX_LEN_INT32), curVal2);
145-
}
146-
}
124+
if(len >= 2*AVX_LEN_INT32){
125+
if (((uintptr_t) (const void *) (dst) % AVX_LEN_BYTES) == 0) {
126+
_mm256_storeu_si256((__m256i *) (dst + 0), curVal);
127+
_mm256_storeu_si256((__m256i *) (dst + AVX_LEN_INT32), curVal2);
128+
} else {
129+
_mm256_storeu_si256((__m256i *) (dst + 0), curVal);
130+
_mm256_storeu_si256((__m256i *) (dst + AVX_LEN_INT32), curVal2);
131+
}
132+
133+
if (((uintptr_t) (const void *) (dst) % AVX_LEN_BYTES) == 0) {
134+
for (int i = 2 * AVX_LEN_INT32; i < stop_len; i += 2 * AVX_LEN_INT32) {
135+
curVal = _mm256_add_epi32(curVal, slope16_vec);
136+
_mm256_store_si256((__m256i *) (dst + i), curVal);
137+
curVal2 = _mm256_add_epi32(curVal2, slope16_vec);
138+
_mm256_store_si256((__m256i *) (dst + i + AVX_LEN_INT32), curVal2);
139+
}
140+
} else {
141+
for (int i = 2 * AVX_LEN_INT32; i < stop_len; i += 2 * AVX_LEN_INT32) {
142+
curVal = _mm256_add_epi32(curVal, slope16_vec);
143+
_mm256_storeu_si256((__m256i *) (dst + i), curVal);
144+
curVal2 = _mm256_add_epi32(curVal2, slope16_vec);
145+
_mm256_storeu_si256((__m256i *) (dst + i + AVX_LEN_INT32), curVal2);
146+
}
147+
}
148+
}
147149

148150
for (int i = stop_len; i < len; i++) {
149151
dst[i] = offset + slope * i;

simd_utils_sse_double.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -425,7 +425,7 @@ static inline void vectorSlope128d(double *dst, int len, double offset, double s
425425
v2sd curVal2 = _mm_add_pd(_mm_set1_pd(offset), coef);
426426
curVal2 = _mm_add_pd(curVal2, _mm_set1_pd(2.0 * slope));
427427

428-
if (len >= SSE_LEN_DOUBLE) {
428+
if (len >= 2*SSE_LEN_DOUBLE) {
429429
if (isAligned((uintptr_t) (dst), SSE_LEN_BYTES)) {
430430
_mm_store_pd(dst + 0, curVal);
431431
_mm_store_pd(dst + SSE_LEN_DOUBLE, curVal2);

simd_utils_sse_float.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -914,7 +914,7 @@ static inline void vectorSlope128f(float *dst, int len, float offset, float slop
914914
int stop_len = len / (2 * SSE_LEN_FLOAT);
915915
stop_len *= (2 * SSE_LEN_FLOAT);
916916

917-
if (len >= SSE_LEN_BYTES) {
917+
if (len >= 2*SSE_LEN_FLOAT) {
918918
if (isAligned((uintptr_t) (dst), SSE_LEN_BYTES)) {
919919
_mm_store_ps(dst + 0, curVal);
920920
_mm_store_ps(dst + SSE_LEN_FLOAT, curVal2);

simd_utils_sse_int32.h

+26-24
Original file line numberDiff line numberDiff line change
@@ -126,30 +126,32 @@ static inline void vectorSlope128s(int *dst, int len, int offset, int slope)
126126
int stop_len = len / (2 * SSE_LEN_INT32);
127127
stop_len *= (2 * SSE_LEN_INT32);
128128

129-
if (isAligned((uintptr_t) (dst), SSE_LEN_BYTES)) {
130-
_mm_store_si128((__m128i *) dst, curVal);
131-
_mm_store_si128((__m128i *) (dst + SSE_LEN_INT32), curVal2);
132-
} else {
133-
_mm_storeu_si128((__m128i *) dst, curVal);
134-
_mm_storeu_si128((__m128i *) (dst + SSE_LEN_INT32), curVal2);
135-
}
136-
137-
if (isAligned((uintptr_t) (dst), SSE_LEN_BYTES)) {
138-
for (int i = 2 * SSE_LEN_INT32; i < stop_len; i += 2 * SSE_LEN_INT32) {
139-
curVal = _mm_add_epi32(curVal, slope8_vec);
140-
_mm_store_si128((__m128i *) (dst + i), curVal);
141-
curVal2 = _mm_add_epi32(curVal2, slope8_vec);
142-
_mm_store_si128((__m128i *) (dst + i + SSE_LEN_INT32), curVal2);
143-
}
144-
} else {
145-
for (int i = 2 * SSE_LEN_INT32; i < stop_len; i += 2 * SSE_LEN_INT32) {
146-
curVal = _mm_add_epi32(curVal, slope8_vec);
147-
_mm_storeu_si128((__m128i *) (dst + i), curVal);
148-
curVal2 = _mm_add_epi32(curVal2, slope8_vec);
149-
_mm_storeu_si128((__m128i *) (dst + i + SSE_LEN_INT32), curVal2);
150-
}
151-
}
152-
129+
if (len >= 2*SSE_LEN_INT32) {
130+
if (isAligned((uintptr_t) (dst), SSE_LEN_BYTES)) {
131+
_mm_store_si128((__m128i *) dst, curVal);
132+
_mm_store_si128((__m128i *) (dst + SSE_LEN_INT32), curVal2);
133+
} else {
134+
_mm_storeu_si128((__m128i *) dst, curVal);
135+
_mm_storeu_si128((__m128i *) (dst + SSE_LEN_INT32), curVal2);
136+
}
137+
138+
if (isAligned((uintptr_t) (dst), SSE_LEN_BYTES)) {
139+
for (int i = 2 * SSE_LEN_INT32; i < stop_len; i += 2 * SSE_LEN_INT32) {
140+
curVal = _mm_add_epi32(curVal, slope8_vec);
141+
_mm_store_si128((__m128i *) (dst + i), curVal);
142+
curVal2 = _mm_add_epi32(curVal2, slope8_vec);
143+
_mm_store_si128((__m128i *) (dst + i + SSE_LEN_INT32), curVal2);
144+
}
145+
} else {
146+
for (int i = 2 * SSE_LEN_INT32; i < stop_len; i += 2 * SSE_LEN_INT32) {
147+
curVal = _mm_add_epi32(curVal, slope8_vec);
148+
_mm_storeu_si128((__m128i *) (dst + i), curVal);
149+
curVal2 = _mm_add_epi32(curVal2, slope8_vec);
150+
_mm_storeu_si128((__m128i *) (dst + i + SSE_LEN_INT32), curVal2);
151+
}
152+
}
153+
}
154+
153155
for (int i = stop_len; i < len; i++) {
154156
dst[i] = offset + slope * i;
155157
}

0 commit comments

Comments
 (0)