@@ -124,30 +124,32 @@ static inline void vectorSlope512s(int *dst, int len, int offset, int slope)
124
124
int stop_len = len / (2 * AVX512_LEN_INT32 );
125
125
stop_len *= (2 * AVX512_LEN_INT32 );
126
126
127
- if (isAligned ((uintptr_t ) (dst ), AVX512_LEN_BYTES )) {
128
- _mm512_store_si512 ((__m512i * ) (dst + 0 ), curVal );
129
- _mm512_store_si512 ((__m512i * ) (dst + AVX512_LEN_INT32 ), curVal2 );
130
- } else {
131
- _mm512_storeu_si512 ((__m512i * ) (dst + 0 ), curVal );
132
- _mm512_storeu_si512 ((__m512i * ) (dst + AVX512_LEN_INT32 ), curVal2 );
133
- }
134
-
135
- if (isAligned ((uintptr_t ) (dst ), AVX512_LEN_BYTES )) {
136
- for (int i = 2 * AVX512_LEN_INT32 ; i < stop_len ; i += 2 * AVX512_LEN_INT32 ) {
137
- curVal = _mm512_add_epi32 (curVal , slope32_vec );
138
- _mm512_store_si512 ((__m512i * ) (dst + i ), curVal );
139
- curVal2 = _mm512_add_epi32 (curVal2 , slope32_vec );
140
- _mm512_store_si512 ((__m512i * ) (dst + i + AVX512_LEN_INT32 ), curVal2 );
141
- }
142
- } else {
143
- for (int i = 2 * AVX512_LEN_INT32 ; i < stop_len ; i += 2 * AVX512_LEN_INT32 ) {
144
- curVal = _mm512_add_epi32 (curVal , slope32_vec );
145
- _mm512_storeu_si512 ((__m512i * ) (dst + i ), curVal );
146
- curVal2 = _mm512_add_epi32 (curVal2 , slope32_vec );
147
- _mm512_storeu_si512 ((__m512i * ) (dst + i + AVX512_LEN_INT32 ), curVal2 );
148
- }
149
- }
150
-
127
+ if (len >= 2 * AVX512_LEN_INT32 ) {
128
+ if (isAligned ((uintptr_t ) (dst ), AVX512_LEN_BYTES )) {
129
+ _mm512_store_si512 ((__m512i * ) (dst + 0 ), curVal );
130
+ _mm512_store_si512 ((__m512i * ) (dst + AVX512_LEN_INT32 ), curVal2 );
131
+ } else {
132
+ _mm512_storeu_si512 ((__m512i * ) (dst + 0 ), curVal );
133
+ _mm512_storeu_si512 ((__m512i * ) (dst + AVX512_LEN_INT32 ), curVal2 );
134
+ }
135
+
136
+ if (isAligned ((uintptr_t ) (dst ), AVX512_LEN_BYTES )) {
137
+ for (int i = 2 * AVX512_LEN_INT32 ; i < stop_len ; i += 2 * AVX512_LEN_INT32 ) {
138
+ curVal = _mm512_add_epi32 (curVal , slope32_vec );
139
+ _mm512_store_si512 ((__m512i * ) (dst + i ), curVal );
140
+ curVal2 = _mm512_add_epi32 (curVal2 , slope32_vec );
141
+ _mm512_store_si512 ((__m512i * ) (dst + i + AVX512_LEN_INT32 ), curVal2 );
142
+ }
143
+ } else {
144
+ for (int i = 2 * AVX512_LEN_INT32 ; i < stop_len ; i += 2 * AVX512_LEN_INT32 ) {
145
+ curVal = _mm512_add_epi32 (curVal , slope32_vec );
146
+ _mm512_storeu_si512 ((__m512i * ) (dst + i ), curVal );
147
+ curVal2 = _mm512_add_epi32 (curVal2 , slope32_vec );
148
+ _mm512_storeu_si512 ((__m512i * ) (dst + i + AVX512_LEN_INT32 ), curVal2 );
149
+ }
150
+ }
151
+ }
152
+
151
153
for (int i = stop_len ; i < len ; i ++ ) {
152
154
dst [i ] = offset + slope * i ;
153
155
}
0 commit comments