Skip to content

Commit 0ab789e

Browse files
jzern authored and Gerrit Code Review committed
Merge changes I6dfedfd5,I2376e2dc into main
* changes:
  rework AddVectorEq_SSE2
  rework AddVector_SSE2
2 parents 0323645 + 61e2cfd commit 0ab789e

File tree

1 file changed: 60 additions and 22 deletions.

src/dsp/lossless_enc_sse2.c

Lines changed: 60 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -175,64 +175,102 @@ static void CollectColorRedTransforms_SSE2(const uint32_t* WEBP_RESTRICT argb,
175175

176176
// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
177177
// that's ok since the histogram values are less than 1<<28 (max picture size).
178-
#define LINE_SIZE 16 // 8 or 16
179178
static void AddVector_SSE2(const uint32_t* WEBP_RESTRICT a,
180179
const uint32_t* WEBP_RESTRICT b,
181180
uint32_t* WEBP_RESTRICT out, int size) {
182-
int i;
183-
for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) {
181+
int i = 0;
182+
int aligned_size = size & ~15;
183+
// Size is, at minimum, NUM_DISTANCE_CODES (40) and may be as large as
184+
// NUM_LITERAL_CODES (256) + NUM_LENGTH_CODES (24) + (0 or a non-zero power of
185+
// 2). See the usage in VP8LHistogramAdd().
186+
assert(size >= 16);
187+
assert(size % 2 == 0);
188+
189+
do {
184190
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
185191
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
186-
#if (LINE_SIZE == 16)
187192
const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
188193
const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
189-
#endif
190194
const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i + 0]);
191195
const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]);
192-
#if (LINE_SIZE == 16)
193196
const __m128i b2 = _mm_loadu_si128((const __m128i*)&b[i + 8]);
194197
const __m128i b3 = _mm_loadu_si128((const __m128i*)&b[i + 12]);
195-
#endif
196198
_mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
197199
_mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
198-
#if (LINE_SIZE == 16)
199200
_mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
200201
_mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
201-
#endif
202+
i += 16;
203+
} while (i != aligned_size);
204+
205+
if ((size & 8) != 0) {
206+
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
207+
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
208+
const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i + 0]);
209+
const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]);
210+
_mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
211+
_mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
212+
i += 8;
202213
}
203-
for (; i < size; ++i) {
204-
out[i] = a[i] + b[i];
214+
215+
size &= 7;
216+
if (size == 4) {
217+
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]);
218+
const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i]);
219+
_mm_storeu_si128((__m128i*)&out[i], _mm_add_epi32(a0, b0));
220+
} else if (size == 2) {
221+
const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[i]);
222+
const __m128i b0 = _mm_loadl_epi64((const __m128i*)&b[i]);
223+
_mm_storel_epi64((__m128i*)&out[i], _mm_add_epi32(a0, b0));
205224
}
206225
}
207226

208227
static void AddVectorEq_SSE2(const uint32_t* WEBP_RESTRICT a,
209228
uint32_t* WEBP_RESTRICT out, int size) {
210-
int i;
211-
for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) {
229+
int i = 0;
230+
int aligned_size = size & ~15;
231+
// Size is, at minimum, NUM_DISTANCE_CODES (40) and may be as large as
232+
// NUM_LITERAL_CODES (256) + NUM_LENGTH_CODES (24) + (0 or a non-zero power of
233+
// 2). See the usage in VP8LHistogramAdd().
234+
assert(size >= 16);
235+
assert(size % 2 == 0);
236+
237+
do {
212238
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
213239
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
214-
#if (LINE_SIZE == 16)
215240
const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
216241
const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
217-
#endif
218242
const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]);
219243
const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]);
220-
#if (LINE_SIZE == 16)
221244
const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i + 8]);
222245
const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]);
223-
#endif
224246
_mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
225247
_mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
226-
#if (LINE_SIZE == 16)
227248
_mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
228249
_mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
229-
#endif
250+
i += 16;
251+
} while (i != aligned_size);
252+
253+
if ((size & 8) != 0) {
254+
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
255+
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
256+
const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]);
257+
const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]);
258+
_mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
259+
_mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
260+
i += 8;
230261
}
231-
for (; i < size; ++i) {
232-
out[i] += a[i];
262+
263+
size &= 7;
264+
if (size == 4) {
265+
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]);
266+
const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i]);
267+
_mm_storeu_si128((__m128i*)&out[i], _mm_add_epi32(a0, b0));
268+
} else if (size == 2) {
269+
const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[i]);
270+
const __m128i b0 = _mm_loadl_epi64((const __m128i*)&out[i]);
271+
_mm_storel_epi64((__m128i*)&out[i], _mm_add_epi32(a0, b0));
233272
}
234273
}
235-
#undef LINE_SIZE
236274

237275
//------------------------------------------------------------------------------
238276
// Entropy

0 commit comments

Comments
 (0)