Skip to content

Commit a4b2441

Browse files
committed
corrected small-size vectorSlope
1 parent 160c50f commit a4b2441

10 files changed

+84
-78
lines changed

simd_utils_altivec_float.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -3544,7 +3544,7 @@ static inline void vectorSlope128f(float *dst, int len, float offset, float slop
35443544
int stop_len = len / (2 * ALTIVEC_LEN_FLOAT);
35453545
stop_len *= (2 * ALTIVEC_LEN_FLOAT);
35463546

3547-
if (len >= ALTIVEC_LEN_BYTES) {
3547+
if (len >= 2*ALTIVEC_LEN_FLOAT) {
35483548
if (isAligned((uintptr_t) (dst), ALTIVEC_LEN_BYTES)) {
35493549
vec_st(curVal, 0, dst + 0);
35503550
vec_st(curVal2, 0, dst + ALTIVEC_LEN_FLOAT);

simd_utils_avx512_double.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -421,7 +421,7 @@ static inline void vectorSlope512d(double *dst, int len, double offset, double s
421421
int stop_len = len / (2 * AVX512_LEN_DOUBLE);
422422
stop_len *= (2 * AVX512_LEN_DOUBLE);
423423

424-
if (len >= AVX512_LEN_DOUBLE) {
424+
if (len >= 2*AVX512_LEN_DOUBLE) {
425425
if (isAligned((uintptr_t) (dst), AVX512_LEN_BYTES)) {
426426
_mm512_store_pd(dst + 0, curVal);
427427
_mm512_store_pd(dst + AVX512_LEN_DOUBLE, curVal2);

simd_utils_avx512_float.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -767,7 +767,7 @@ static inline void vectorSlope512f(float *dst, int len, float offset, float slop
767767
int stop_len = len / (2 * AVX512_LEN_FLOAT);
768768
stop_len *= (2 * AVX512_LEN_FLOAT);
769769

770-
if (len >= AVX512_LEN_FLOAT) {
770+
if (len >= 2*AVX512_LEN_FLOAT) {
771771
if (isAligned((uintptr_t) (dst), AVX512_LEN_BYTES)) {
772772
_mm512_store_ps(dst + 0, curVal);
773773
_mm512_store_ps(dst + AVX512_LEN_FLOAT, curVal2);

simd_utils_avx512_int32.h

+26-24
Original file line numberDiff line numberDiff line change
@@ -124,30 +124,32 @@ static inline void vectorSlope512s(int *dst, int len, int offset, int slope)
124124
int stop_len = len / (2 * AVX512_LEN_INT32);
125125
stop_len *= (2 * AVX512_LEN_INT32);
126126

127-
if (isAligned((uintptr_t) (dst), AVX512_LEN_BYTES)) {
128-
_mm512_store_si512((__m512i *) (dst + 0), curVal);
129-
_mm512_store_si512((__m512i *) (dst + AVX512_LEN_INT32), curVal2);
130-
} else {
131-
_mm512_storeu_si512((__m512i *) (dst + 0), curVal);
132-
_mm512_storeu_si512((__m512i *) (dst + AVX512_LEN_INT32), curVal2);
133-
}
134-
135-
if (isAligned((uintptr_t) (dst), AVX512_LEN_BYTES)) {
136-
for (int i = 2 * AVX512_LEN_INT32; i < stop_len; i += 2 * AVX512_LEN_INT32) {
137-
curVal = _mm512_add_epi32(curVal, slope32_vec);
138-
_mm512_store_si512((__m512i *) (dst + i), curVal);
139-
curVal2 = _mm512_add_epi32(curVal2, slope32_vec);
140-
_mm512_store_si512((__m512i *) (dst + i + AVX512_LEN_INT32), curVal2);
141-
}
142-
} else {
143-
for (int i = 2 * AVX512_LEN_INT32; i < stop_len; i += 2 * AVX512_LEN_INT32) {
144-
curVal = _mm512_add_epi32(curVal, slope32_vec);
145-
_mm512_storeu_si512((__m512i *) (dst + i), curVal);
146-
curVal2 = _mm512_add_epi32(curVal2, slope32_vec);
147-
_mm512_storeu_si512((__m512i *) (dst + i + AVX512_LEN_INT32), curVal2);
148-
}
149-
}
150-
127+
if (len >= 2*AVX512_LEN_INT32) {
128+
if (isAligned((uintptr_t) (dst), AVX512_LEN_BYTES)) {
129+
_mm512_store_si512((__m512i *) (dst + 0), curVal);
130+
_mm512_store_si512((__m512i *) (dst + AVX512_LEN_INT32), curVal2);
131+
} else {
132+
_mm512_storeu_si512((__m512i *) (dst + 0), curVal);
133+
_mm512_storeu_si512((__m512i *) (dst + AVX512_LEN_INT32), curVal2);
134+
}
135+
136+
if (isAligned((uintptr_t) (dst), AVX512_LEN_BYTES)) {
137+
for (int i = 2 * AVX512_LEN_INT32; i < stop_len; i += 2 * AVX512_LEN_INT32) {
138+
curVal = _mm512_add_epi32(curVal, slope32_vec);
139+
_mm512_store_si512((__m512i *) (dst + i), curVal);
140+
curVal2 = _mm512_add_epi32(curVal2, slope32_vec);
141+
_mm512_store_si512((__m512i *) (dst + i + AVX512_LEN_INT32), curVal2);
142+
}
143+
} else {
144+
for (int i = 2 * AVX512_LEN_INT32; i < stop_len; i += 2 * AVX512_LEN_INT32) {
145+
curVal = _mm512_add_epi32(curVal, slope32_vec);
146+
_mm512_storeu_si512((__m512i *) (dst + i), curVal);
147+
curVal2 = _mm512_add_epi32(curVal2, slope32_vec);
148+
_mm512_storeu_si512((__m512i *) (dst + i + AVX512_LEN_INT32), curVal2);
149+
}
150+
}
151+
}
152+
151153
for (int i = stop_len; i < len; i++) {
152154
dst[i] = offset + slope * i;
153155
}

simd_utils_avx_double.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -422,7 +422,7 @@ static inline void vectorSlope256d(double *dst, int len, double offset, double s
422422
v4sd curVal2 = _mm256_add_pd(_mm256_set1_pd(offset), coef);
423423
curVal2 = _mm256_add_pd(curVal2, _mm256_set1_pd(4.0 * slope));
424424

425-
if (len >= AVX_LEN_DOUBLE) {
425+
if (len >= 2*AVX_LEN_DOUBLE) {
426426
if (isAligned((uintptr_t) (dst), AVX_LEN_BYTES)) {
427427
_mm256_store_pd(dst + 0, curVal);
428428
_mm256_store_pd(dst + AVX_LEN_DOUBLE, curVal2);

simd_utils_avx_float.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -865,7 +865,7 @@ static inline void vectorSlope256f(float *dst, int len, float offset, float slop
865865
int stop_len = len / (2 * AVX_LEN_FLOAT);
866866
stop_len *= (2 * AVX_LEN_FLOAT);
867867

868-
if (len >= AVX_LEN_FLOAT) {
868+
if (len >= 2*AVX_LEN_FLOAT) {
869869
if (((uintptr_t) (const void *) (dst) % AVX_LEN_BYTES) == 0) {
870870
_mm256_store_ps(dst + 0, curVal);
871871
_mm256_store_ps(dst + AVX_LEN_FLOAT, curVal2);

simd_utils_avx_int32.h

+25-23
Original file line numberDiff line numberDiff line change
@@ -121,29 +121,31 @@ static inline void vectorSlope256s(int *dst, int len, int offset, int slope)
121121
int stop_len = len / (2 * AVX_LEN_INT32);
122122
stop_len *= (2 * AVX_LEN_INT32);
123123

124-
if (((uintptr_t) (const void *) (dst) % AVX_LEN_BYTES) == 0) {
125-
_mm256_storeu_si256((__m256i *) (dst + 0), curVal);
126-
_mm256_storeu_si256((__m256i *) (dst + AVX_LEN_INT32), curVal2);
127-
} else {
128-
_mm256_storeu_si256((__m256i *) (dst + 0), curVal);
129-
_mm256_storeu_si256((__m256i *) (dst + AVX_LEN_INT32), curVal2);
130-
}
131-
132-
if (((uintptr_t) (const void *) (dst) % AVX_LEN_BYTES) == 0) {
133-
for (int i = 2 * AVX_LEN_INT32; i < stop_len; i += 2 * AVX_LEN_INT32) {
134-
curVal = _mm256_add_epi32(curVal, slope16_vec);
135-
_mm256_store_si256((__m256i *) (dst + i), curVal);
136-
curVal2 = _mm256_add_epi32(curVal2, slope16_vec);
137-
_mm256_store_si256((__m256i *) (dst + i + AVX_LEN_INT32), curVal2);
138-
}
139-
} else {
140-
for (int i = 2 * AVX_LEN_INT32; i < stop_len; i += 2 * AVX_LEN_INT32) {
141-
curVal = _mm256_add_epi32(curVal, slope16_vec);
142-
_mm256_storeu_si256((__m256i *) (dst + i), curVal);
143-
curVal2 = _mm256_add_epi32(curVal2, slope16_vec);
144-
_mm256_storeu_si256((__m256i *) (dst + i + AVX_LEN_INT32), curVal2);
145-
}
146-
}
124+
if(len >= 2*AVX_LEN_INT32){
125+
if (((uintptr_t) (const void *) (dst) % AVX_LEN_BYTES) == 0) {
126+
_mm256_storeu_si256((__m256i *) (dst + 0), curVal);
127+
_mm256_storeu_si256((__m256i *) (dst + AVX_LEN_INT32), curVal2);
128+
} else {
129+
_mm256_storeu_si256((__m256i *) (dst + 0), curVal);
130+
_mm256_storeu_si256((__m256i *) (dst + AVX_LEN_INT32), curVal2);
131+
}
132+
133+
if (((uintptr_t) (const void *) (dst) % AVX_LEN_BYTES) == 0) {
134+
for (int i = 2 * AVX_LEN_INT32; i < stop_len; i += 2 * AVX_LEN_INT32) {
135+
curVal = _mm256_add_epi32(curVal, slope16_vec);
136+
_mm256_store_si256((__m256i *) (dst + i), curVal);
137+
curVal2 = _mm256_add_epi32(curVal2, slope16_vec);
138+
_mm256_store_si256((__m256i *) (dst + i + AVX_LEN_INT32), curVal2);
139+
}
140+
} else {
141+
for (int i = 2 * AVX_LEN_INT32; i < stop_len; i += 2 * AVX_LEN_INT32) {
142+
curVal = _mm256_add_epi32(curVal, slope16_vec);
143+
_mm256_storeu_si256((__m256i *) (dst + i), curVal);
144+
curVal2 = _mm256_add_epi32(curVal2, slope16_vec);
145+
_mm256_storeu_si256((__m256i *) (dst + i + AVX_LEN_INT32), curVal2);
146+
}
147+
}
148+
}
147149

148150
for (int i = stop_len; i < len; i++) {
149151
dst[i] = offset + slope * i;

simd_utils_sse_double.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -425,7 +425,7 @@ static inline void vectorSlope128d(double *dst, int len, double offset, double s
425425
v2sd curVal2 = _mm_add_pd(_mm_set1_pd(offset), coef);
426426
curVal2 = _mm_add_pd(curVal2, _mm_set1_pd(2.0 * slope));
427427

428-
if (len >= SSE_LEN_DOUBLE) {
428+
if (len >= 2*SSE_LEN_DOUBLE) {
429429
if (isAligned((uintptr_t) (dst), SSE_LEN_BYTES)) {
430430
_mm_store_pd(dst + 0, curVal);
431431
_mm_store_pd(dst + SSE_LEN_DOUBLE, curVal2);

simd_utils_sse_float.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -914,7 +914,7 @@ static inline void vectorSlope128f(float *dst, int len, float offset, float slop
914914
int stop_len = len / (2 * SSE_LEN_FLOAT);
915915
stop_len *= (2 * SSE_LEN_FLOAT);
916916

917-
if (len >= SSE_LEN_BYTES) {
917+
if (len >= 2*SSE_LEN_FLOAT) {
918918
if (isAligned((uintptr_t) (dst), SSE_LEN_BYTES)) {
919919
_mm_store_ps(dst + 0, curVal);
920920
_mm_store_ps(dst + SSE_LEN_FLOAT, curVal2);

simd_utils_sse_int32.h

+26-24
Original file line numberDiff line numberDiff line change
@@ -126,30 +126,32 @@ static inline void vectorSlope128s(int *dst, int len, int offset, int slope)
126126
int stop_len = len / (2 * SSE_LEN_INT32);
127127
stop_len *= (2 * SSE_LEN_INT32);
128128

129-
if (isAligned((uintptr_t) (dst), SSE_LEN_BYTES)) {
130-
_mm_store_si128((__m128i *) dst, curVal);
131-
_mm_store_si128((__m128i *) (dst + SSE_LEN_INT32), curVal2);
132-
} else {
133-
_mm_storeu_si128((__m128i *) dst, curVal);
134-
_mm_storeu_si128((__m128i *) (dst + SSE_LEN_INT32), curVal2);
135-
}
136-
137-
if (isAligned((uintptr_t) (dst), SSE_LEN_BYTES)) {
138-
for (int i = 2 * SSE_LEN_INT32; i < stop_len; i += 2 * SSE_LEN_INT32) {
139-
curVal = _mm_add_epi32(curVal, slope8_vec);
140-
_mm_store_si128((__m128i *) (dst + i), curVal);
141-
curVal2 = _mm_add_epi32(curVal2, slope8_vec);
142-
_mm_store_si128((__m128i *) (dst + i + SSE_LEN_INT32), curVal2);
143-
}
144-
} else {
145-
for (int i = 2 * SSE_LEN_INT32; i < stop_len; i += 2 * SSE_LEN_INT32) {
146-
curVal = _mm_add_epi32(curVal, slope8_vec);
147-
_mm_storeu_si128((__m128i *) (dst + i), curVal);
148-
curVal2 = _mm_add_epi32(curVal2, slope8_vec);
149-
_mm_storeu_si128((__m128i *) (dst + i + SSE_LEN_INT32), curVal2);
150-
}
151-
}
152-
129+
if (len >= 2*SSE_LEN_INT32) {
130+
if (isAligned((uintptr_t) (dst), SSE_LEN_BYTES)) {
131+
_mm_store_si128((__m128i *) dst, curVal);
132+
_mm_store_si128((__m128i *) (dst + SSE_LEN_INT32), curVal2);
133+
} else {
134+
_mm_storeu_si128((__m128i *) dst, curVal);
135+
_mm_storeu_si128((__m128i *) (dst + SSE_LEN_INT32), curVal2);
136+
}
137+
138+
if (isAligned((uintptr_t) (dst), SSE_LEN_BYTES)) {
139+
for (int i = 2 * SSE_LEN_INT32; i < stop_len; i += 2 * SSE_LEN_INT32) {
140+
curVal = _mm_add_epi32(curVal, slope8_vec);
141+
_mm_store_si128((__m128i *) (dst + i), curVal);
142+
curVal2 = _mm_add_epi32(curVal2, slope8_vec);
143+
_mm_store_si128((__m128i *) (dst + i + SSE_LEN_INT32), curVal2);
144+
}
145+
} else {
146+
for (int i = 2 * SSE_LEN_INT32; i < stop_len; i += 2 * SSE_LEN_INT32) {
147+
curVal = _mm_add_epi32(curVal, slope8_vec);
148+
_mm_storeu_si128((__m128i *) (dst + i), curVal);
149+
curVal2 = _mm_add_epi32(curVal2, slope8_vec);
150+
_mm_storeu_si128((__m128i *) (dst + i + SSE_LEN_INT32), curVal2);
151+
}
152+
}
153+
}
154+
153155
for (int i = stop_len; i < len; i++) {
154156
dst[i] = offset + slope * i;
155157
}

0 commit comments

Comments
 (0)