@@ -175,64 +175,102 @@ static void CollectColorRedTransforms_SSE2(const uint32_t* WEBP_RESTRICT argb,
175
175
176
176
// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
// that's ok since the histogram values are less than 1<<28 (max picture size).
#ifndef WEBP_RESTRICT
#define WEBP_RESTRICT
#endif
// out[i] = a[i] + b[i] for i in [0, size). Requires size >= 16 and even.
static void AddVector_SSE2(const uint32_t* WEBP_RESTRICT a,
                           const uint32_t* WEBP_RESTRICT b,
                           uint32_t* WEBP_RESTRICT out, int size) {
  int i = 0;
  const int aligned_size = size & ~15;
  // Size is, at minimum, NUM_DISTANCE_CODES (40) and may be as large as
  // NUM_LITERAL_CODES (256) + NUM_LENGTH_CODES (24) + (0 or a non-zero power of
  // 2). See the usage in VP8LHistogramAdd().
  assert(size >= 16);
  assert(size % 2 == 0);

  // Main loop: 16 lanes per iteration. size >= 16 guarantees at least one
  // pass, so do/while is safe.
  do {
    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
    const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
    const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
    const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i + 0]);
    const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]);
    const __m128i b2 = _mm_loadu_si128((const __m128i*)&b[i + 8]);
    const __m128i b3 = _mm_loadu_si128((const __m128i*)&b[i + 12]);
    _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
    _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
    _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
    _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
    i += 16;
  } while (i != aligned_size);

  if ((size & 8) != 0) {
    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
    const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i + 0]);
    const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]);
    _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
    _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
    i += 8;
  }

  // Tail: size is even, so the remainder here is 0, 2, 4 or 6. Handling
  // ">= 4" then "== 2" covers all four cases (a remainder of 6 cannot occur
  // for the documented histogram sizes, but would previously have been
  // silently dropped; this chain makes the function correct for any even
  // size >= 16).
  size &= 7;
  if (size >= 4) {
    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]);
    const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i]);
    _mm_storeu_si128((__m128i*)&out[i], _mm_add_epi32(a0, b0));
    i += 4;
    size -= 4;
  }
  if (size == 2) {
    const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[i]);
    const __m128i b0 = _mm_loadl_epi64((const __m128i*)&b[i]);
    _mm_storel_epi64((__m128i*)&out[i], _mm_add_epi32(a0, b0));
  }
}
207
226
208
227
#ifndef WEBP_RESTRICT
#define WEBP_RESTRICT
#endif
// out[i] += a[i] for i in [0, size). In-place variant of AddVector_SSE2;
// same contract: size >= 16 and even. uint32_t sums via _mm_add_epi32 are
// fine since histogram values stay below 1<<28.
static void AddVectorEq_SSE2(const uint32_t* WEBP_RESTRICT a,
                             uint32_t* WEBP_RESTRICT out, int size) {
  int i = 0;
  const int aligned_size = size & ~15;
  // Size is, at minimum, NUM_DISTANCE_CODES (40) and may be as large as
  // NUM_LITERAL_CODES (256) + NUM_LENGTH_CODES (24) + (0 or a non-zero power of
  // 2). See the usage in VP8LHistogramAdd().
  assert(size >= 16);
  assert(size % 2 == 0);

  // Main loop: 16 lanes per iteration. size >= 16 guarantees at least one
  // pass, so do/while is safe.
  do {
    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
    const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
    const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
    const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]);
    const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]);
    const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i + 8]);
    const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]);
    _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
    _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
    _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
    _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
    i += 16;
  } while (i != aligned_size);

  if ((size & 8) != 0) {
    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
    const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]);
    const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]);
    _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
    _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
    i += 8;
  }

  // Tail: size is even, so the remainder here is 0, 2, 4 or 6. Handling
  // ">= 4" then "== 2" covers all four cases (a remainder of 6 cannot occur
  // for the documented histogram sizes, but would previously have been
  // silently dropped; this chain makes the function correct for any even
  // size >= 16).
  size &= 7;
  if (size >= 4) {
    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]);
    const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i]);
    _mm_storeu_si128((__m128i*)&out[i], _mm_add_epi32(a0, b0));
    i += 4;
    size -= 4;
  }
  if (size == 2) {
    const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[i]);
    const __m128i b0 = _mm_loadl_epi64((const __m128i*)&out[i]);
    _mm_storel_epi64((__m128i*)&out[i], _mm_add_epi32(a0, b0));
  }
}
274
237
275
//------------------------------------------------------------------------------
238
276
// Entropy
0 commit comments