Skip to content

Commit bf7362c

Browse files
authored
Merge pull request #1881 from SixLabors/bp/Vp8_Sse16xN
Add SSE2 and AVX2 versions of Vp8_Sse16X16 and Vp8_Sse16X8
2 parents b14a78c + 43fc8e3 commit bf7362c

File tree

2 files changed

+274
-2
lines changed

2 files changed

+274
-2
lines changed

src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs

Lines changed: 132 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -40,11 +40,43 @@ internal static class LossyUtils
4040

4141
// Note: this method is named VP8SSE16x16 in the libwebp reference implementation.
4242
[MethodImpl(InliningOptions.ShortMethod)]
43-
public static int Vp8_Sse16X16(Span<byte> a, Span<byte> b) => Vp8_SseNxN(a, b, 16, 16);
43+
public static int Vp8_Sse16X16(Span<byte> a, Span<byte> b)
{
    // Returns the 16x16 result for the two buffers, using the widest vector path the
    // host supports and falling back to the scalar Vp8_SseNxN otherwise.
#if SUPPORTS_RUNTIME_INTRINSICS
    if (Avx2.IsSupported)
    {
        // The AVX2 helper consumes four 16-byte runs per iteration: 4 iterations cover 16 of them.
        return Vp8_Sse16xN_Avx2(a, b, 4);
    }

    if (Sse2.IsSupported)
    {
        // The SSE2 helper consumes two 16-byte runs per iteration: 8 iterations cover 16 of them.
        return Vp8_Sse16xN_Sse2(a, b, 8);
    }
#endif

    // Scalar path: used when intrinsics are compiled out or unsupported at runtime.
    return Vp8_SseNxN(a, b, 16, 16);
}
4460

4561
// Note: this method is named VP8SSE16x8 in the libwebp reference implementation.
4662
[MethodImpl(InliningOptions.ShortMethod)]
47-
public static int Vp8_Sse16X8(Span<byte> a, Span<byte> b) => Vp8_SseNxN(a, b, 16, 8);
63+
public static int Vp8_Sse16X8(Span<byte> a, Span<byte> b)
{
    // Returns the 16x8 result for the two buffers, using the widest vector path the
    // host supports and falling back to the scalar Vp8_SseNxN otherwise.
#if SUPPORTS_RUNTIME_INTRINSICS
    if (Avx2.IsSupported)
    {
        // The AVX2 helper consumes four 16-byte runs per iteration: 2 iterations cover 8 of them.
        return Vp8_Sse16xN_Avx2(a, b, 2);
    }

    if (Sse2.IsSupported)
    {
        // The SSE2 helper consumes two 16-byte runs per iteration: 4 iterations cover 8 of them.
        return Vp8_Sse16xN_Sse2(a, b, 4);
    }
#endif

    // Scalar path: used when intrinsics are compiled out or unsupported at runtime.
    return Vp8_SseNxN(a, b, 16, 8);
}
4880

4981
// Note: this method is named VP8SSE4x4 in the libwebp reference implementation.
5082
[MethodImpl(InliningOptions.ShortMethod)]
@@ -146,6 +178,104 @@ public static int Vp8_SseNxN(Span<byte> a, Span<byte> b, int w, int h)
146178
return count;
147179
}
148180

181+
#if SUPPORTS_RUNTIME_INTRINSICS
182+
[MethodImpl(InliningOptions.ShortMethod)]
183+
private static int Vp8_Sse16xN_Sse2(Span<byte> a, Span<byte> b, int numPairs)
{
    // Accumulates the squared byte differences between a and b over numPairs iterations.
    // Each iteration reads two 16-byte runs from each buffer: one at the current offset
    // and one WebpConstants.Bps bytes later (presumably the next row; confirm Bps is the stride).
    // NOTE(review): the loads are unchecked, so callers must supply buffers that are large enough.
    ref byte firstA = ref MemoryMarshal.GetReference(a);
    ref byte firstB = ref MemoryMarshal.GetReference(b);
    Vector128<int> total = Vector128<int>.Zero;

    nint start = 0;
    for (int remaining = numPairs; remaining > 0; remaining--)
    {
        // First run of the pair.
        Vector128<byte> upperA = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref firstA, start));
        Vector128<byte> upperB = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref firstB, start));

        // Second run of the pair, Bps bytes further on.
        Vector128<byte> lowerA = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref firstA, start + WebpConstants.Bps));
        Vector128<byte> lowerB = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref firstB, start + WebpConstants.Bps));

        Vector128<int> upperSquares = SubtractAndAccumulate(upperA, upperB);
        Vector128<int> lowerSquares = SubtractAndAccumulate(lowerA, lowerB);
        total = Sse2.Add(total, Sse2.Add(upperSquares, lowerSquares));

        // Step past both runs handled in this iteration.
        start += 2 * WebpConstants.Bps;
    }

    // Collapse the four 32-bit partial sums into the final scalar.
    return Numerics.ReduceSum(total);
}
206+
207+
[MethodImpl(InliningOptions.ShortMethod)]
208+
private static int Vp8_Sse16xN_Avx2(Span<byte> a, Span<byte> b, int numPairs)
{
    // Accumulates the squared byte differences between a and b over numPairs iterations.
    // Each iteration reads four 16-byte runs from each buffer (at 0, 1, 2 and 3 times
    // WebpConstants.Bps from the current offset) and packs them two at a time into
    // 256-bit vectors, so numPairs here counts four-run iterations.
    // NOTE(review): the loads are unchecked, so callers must supply buffers that are large enough.
    ref byte firstA = ref MemoryMarshal.GetReference(a);
    ref byte firstB = ref MemoryMarshal.GetReference(b);
    Vector256<int> total = Vector256<int>.Zero;

    nint start = 0;
    for (int remaining = numPairs; remaining > 0; remaining--)
    {
        // Runs 0 and 1 of this iteration, packed as (lower half, upper half).
        Vector256<byte> firstPairA = Vector256.Create(
            Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref firstA, start)),
            Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref firstA, start + WebpConstants.Bps)));
        Vector256<byte> firstPairB = Vector256.Create(
            Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref firstB, start)),
            Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref firstB, start + WebpConstants.Bps)));

        // Runs 2 and 3 of this iteration.
        Vector256<byte> secondPairA = Vector256.Create(
            Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref firstA, start + (2 * WebpConstants.Bps))),
            Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref firstA, start + (3 * WebpConstants.Bps))));
        Vector256<byte> secondPairB = Vector256.Create(
            Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref firstB, start + (2 * WebpConstants.Bps))),
            Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref firstB, start + (3 * WebpConstants.Bps))));

        Vector256<int> firstSquares = SubtractAndAccumulate(firstPairA, firstPairB);
        Vector256<int> secondSquares = SubtractAndAccumulate(secondPairA, secondPairB);
        total = Avx2.Add(total, Avx2.Add(firstSquares, secondSquares));

        // Step past all four runs handled in this iteration.
        start += 4 * WebpConstants.Bps;
    }

    // Collapse the eight 32-bit partial sums into the final scalar.
    return Numerics.ReduceSum(total);
}
239+
240+
[MethodImpl(InliningOptions.ShortMethod)]
241+
private static Vector128<int> SubtractAndAccumulate(Vector128<byte> a, Vector128<byte> b)
{
    // Computes (a[i] - b[i])^2 for all 16 bytes and returns the squares summed
    // into four 32-bit lanes.
    Vector128<byte> zero = Vector128<byte>.Zero;

    // |a - b| per byte: of the two saturating differences one is zero and the
    // other holds the magnitude, so OR-ing them yields the absolute difference.
    Vector128<byte> magnitude = Sse2.Or(Sse2.SubtractSaturate(a, b), Sse2.SubtractSaturate(b, a));

    // Interleaving with zero bytes widens every byte to a 16-bit lane.
    Vector128<short> lowHalf = Sse2.UnpackLow(magnitude, zero).AsInt16();
    Vector128<short> highHalf = Sse2.UnpackHigh(magnitude, zero).AsInt16();

    // Multiplying each vector by itself squares the lanes; adjacent products are
    // added pairwise into 32-bit lanes.
    Vector128<int> lowSquares = Sse2.MultiplyAddAdjacent(lowHalf, lowHalf);
    Vector128<int> highSquares = Sse2.MultiplyAddAdjacent(highHalf, highHalf);

    return Sse2.Add(lowSquares, highSquares);
}
258+
259+
[MethodImpl(InliningOptions.ShortMethod)]
260+
private static Vector256<int> SubtractAndAccumulate(Vector256<byte> a, Vector256<byte> b)
{
    // Computes (a[i] - b[i])^2 for all 32 bytes and returns the squares summed
    // into eight 32-bit lanes.
    Vector256<byte> zero = Vector256<byte>.Zero;

    // |a - b| per byte: of the two saturating differences one is zero and the
    // other holds the magnitude, so OR-ing them yields the absolute difference.
    Vector256<byte> magnitude = Avx2.Or(Avx2.SubtractSaturate(a, b), Avx2.SubtractSaturate(b, a));

    // Interleaving with zero bytes widens every byte to a 16-bit lane.
    Vector256<short> lowHalf = Avx2.UnpackLow(magnitude, zero).AsInt16();
    Vector256<short> highHalf = Avx2.UnpackHigh(magnitude, zero).AsInt16();

    // Multiplying each vector by itself squares the lanes; adjacent products are
    // added pairwise into 32-bit lanes.
    Vector256<int> lowSquares = Avx2.MultiplyAddAdjacent(lowHalf, lowHalf);
    Vector256<int> highSquares = Avx2.MultiplyAddAdjacent(highHalf, highHalf);

    return Avx2.Add(lowSquares, highSquares);
}
277+
#endif
278+
149279
[MethodImpl(InliningOptions.ShortMethod)]
150280
public static void Vp8Copy4X4(Span<byte> src, Span<byte> dst) => Copy(src, dst, 4, 4);
151281

tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs

Lines changed: 142 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -76,6 +76,124 @@ private static void RunTransformOneTest()
7676
Assert.True(expected.SequenceEqual(dst));
7777
}
7878

79+
private static void RunVp8Sse16X16Test()
{
    // Feeds two fixed byte buffers to LossyUtils.Vp8_Sse16X16 and checks the
    // returned value against the known-good result for this data.

    // arrange
    const int expectedSse = 2063;

    byte[] first = new byte[]
    {
        154, 154, 151, 151, 149, 148, 151, 157, 163, 163, 154, 132, 102, 98, 104, 108, 107, 104, 104, 103,
        101, 106, 123, 119, 170, 171, 172, 171, 168, 175, 171, 173, 151, 151, 149, 150, 147, 147, 146, 159,
        164, 165, 154, 129, 92, 90, 101, 105, 104, 103, 104, 101, 100, 105, 123, 117, 172, 172, 172, 168,
        170, 177, 170, 175, 151, 149, 150, 150, 147, 147, 156, 161, 161, 161, 151, 126, 93, 90, 102, 107,
        104, 103, 104, 101, 104, 104, 122, 117, 172, 172, 170, 168, 170, 177, 172, 175, 150, 149, 152, 151,
        148, 151, 160, 159, 157, 157, 148, 133, 96, 90, 103, 107, 104, 104, 101, 100, 102, 102, 121, 117,
        170, 170, 169, 171, 171, 179, 173, 175, 149, 151, 152, 151, 148, 154, 162, 157, 154, 154, 151, 132,
        92, 89, 101, 108, 104, 102, 101, 101, 103, 103, 123, 118, 171, 168, 177, 173, 171, 178, 172, 176,
        152, 152, 152, 151, 154, 162, 161, 155, 149, 157, 156, 129, 92, 87, 101, 107, 102, 100, 107, 100,
        101, 102, 123, 118, 170, 175, 182, 172, 171, 179, 173, 175, 152, 151, 154, 155, 160, 162, 161, 153,
        150, 156, 153, 129, 92, 91, 102, 106, 100, 109, 115, 99, 101, 102, 124, 120, 171, 179, 178, 172,
        171, 181, 171, 173, 154, 154, 154, 162, 160, 158, 156, 152, 153, 157, 151, 128, 86, 86, 102, 105,
        102, 122, 114, 99, 101, 102, 125, 120, 178, 173, 177, 172, 171, 180, 172, 173, 154, 152, 158, 163,
        150, 148, 148, 156, 151, 158, 152, 129, 87, 87, 101, 105, 204, 204, 204, 204, 204, 204, 204, 204,
        204, 204, 204, 204, 204, 204, 204, 204, 154, 151, 165, 156, 141, 137, 146, 158, 152, 159, 152, 133,
        90, 88, 99, 106, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
        154, 160, 164, 150, 126, 127, 149, 159, 155, 161, 153, 131, 84, 86, 97, 103, 204, 204, 204, 204,
        204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 157, 167, 157, 137, 102, 128, 155, 161,
        157, 159, 154, 134, 84, 82, 97, 102, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
        204, 204, 204, 204, 163, 163, 150, 113, 78, 132, 156, 162, 159, 160, 154, 132, 83, 78, 91, 97, 204,
        204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 163, 157, 137, 80, 78,
        131, 154, 163, 157, 159, 149, 131, 82, 77, 94, 100, 204, 204, 204, 204, 204, 204, 204, 204, 204,
        204, 204, 204, 204, 204, 204, 204, 159, 151, 108, 72, 88, 132, 156, 162, 159, 157, 151, 130, 79, 78,
        95, 102, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 151, 130,
        82, 82, 89, 134, 154, 161, 161, 157, 152, 129, 81, 77, 95, 102, 204, 204, 204, 204, 204, 204, 204,
        204, 204, 204, 204, 204, 204, 204, 204, 204
    };

    byte[] second = new byte[]
    {
        150, 150, 150, 150, 146, 149, 152, 154, 164, 166, 154, 132, 99, 92, 106, 112, 204, 204, 204, 204,
        204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 150, 150, 150, 150, 146, 149, 152, 154,
        161, 164, 151, 130, 93, 86, 100, 106, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
        204, 204, 204, 204, 150, 150, 150, 150, 146, 149, 152, 154, 158, 161, 148, 127, 93, 86, 100, 106,
        204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 150, 150, 150, 150,
        146, 149, 152, 154, 156, 159, 146, 125, 99, 92, 106, 112, 204, 204, 204, 204, 204, 204, 204, 204,
        204, 204, 204, 204, 204, 204, 204, 204, 148, 148, 148, 148, 149, 158, 162, 159, 155, 155, 153, 129,
        94, 87, 101, 106, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
        151, 151, 151, 151, 152, 159, 161, 156, 155, 155, 153, 129, 94, 87, 101, 106, 204, 204, 204, 204,
        204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 154, 154, 154, 154, 156, 161, 159, 152,
        155, 155, 153, 129, 94, 87, 101, 106, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
        204, 204, 204, 204, 156, 156, 156, 156, 159, 162, 158, 149, 155, 155, 153, 129, 94, 87, 101, 106,
        204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 152, 153, 157, 162,
        150, 149, 149, 151, 155, 160, 150, 131, 91, 90, 104, 104, 204, 204, 204, 204, 204, 204, 204, 204,
        204, 204, 204, 204, 204, 204, 204, 204, 152, 156, 158, 157, 140, 137, 145, 159, 155, 160, 150, 131,
        89, 88, 102, 101, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
        153, 161, 160, 149, 118, 128, 147, 162, 155, 160, 150, 131, 86, 85, 99, 98, 204, 204, 204, 204, 204,
        204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 154, 165, 161, 144, 96, 128, 154, 159, 155,
        160, 150, 131, 83, 82, 97, 96, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
        204, 204, 161, 160, 149, 105, 78, 127, 156, 170, 156, 156, 154, 130, 81, 77, 95, 102, 204, 204, 204,
        204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 160, 160, 133, 85, 81, 129, 155,
        167, 156, 156, 154, 130, 81, 77, 95, 102, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
        204, 204, 204, 204, 204, 156, 147, 109, 76, 85, 130, 153, 163, 156, 156, 154, 130, 81, 77, 95, 102,
        204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 152, 128, 87, 83,
        88, 132, 152, 159, 156, 156, 154, 130, 81, 77, 95, 102, 204, 204, 204, 204, 204, 204, 204, 204, 204,
        204, 204, 204, 204, 204, 204, 204
    };

    // act
    int computedSse = LossyUtils.Vp8_Sse16X16(first, second);

    // assert
    Assert.Equal(expectedSse, computedSse);
}
150+
151+
private static void RunVp8Sse16X8Test()
{
    // Feeds two fixed byte buffers to LossyUtils.Vp8_Sse16X8 and checks the
    // returned value against the known-good result for this data.

    // arrange
    const int expectedSse = 749;

    byte[] first = new byte[]
    {
        107, 104, 104, 103, 101, 106, 123, 119, 170, 171, 172, 171, 168, 175, 171, 173, 151, 151, 149, 150,
        147, 147, 146, 159, 164, 165, 154, 129, 92, 90, 101, 105, 104, 103, 104, 101, 100, 105, 123, 117,
        172, 172, 172, 168, 170, 177, 170, 175, 151, 149, 150, 150, 147, 147, 156, 161, 161, 161, 151, 126,
        93, 90, 102, 107, 104, 103, 104, 101, 104, 104, 122, 117, 172, 172, 170, 168, 170, 177, 172, 175,
        150, 149, 152, 151, 148, 151, 160, 159, 157, 157, 148, 133, 96, 90, 103, 107, 104, 104, 101, 100,
        102, 102, 121, 117, 170, 170, 169, 171, 171, 179, 173, 175, 149, 151, 152, 151, 148, 154, 162, 157,
        154, 154, 151, 132, 92, 89, 101, 108, 104, 102, 101, 101, 103, 103, 123, 118, 171, 168, 177, 173,
        171, 178, 172, 176, 152, 152, 152, 151, 154, 162, 161, 155, 149, 157, 156, 129, 92, 87, 101, 107,
        102, 100, 107, 100, 101, 102, 123, 118, 170, 175, 182, 172, 171, 179, 173, 175, 152, 151, 154, 155,
        160, 162, 161, 153, 150, 156, 153, 129, 92, 91, 102, 106, 100, 109, 115, 99, 101, 102, 124, 120,
        171, 179, 178, 172, 171, 181, 171, 173, 154, 154, 154, 162, 160, 158, 156, 152, 153, 157, 151, 128,
        86, 86, 102, 105, 102, 122, 114, 99, 101, 102, 125, 120, 178, 173, 177, 172, 171, 180, 172, 173,
        154, 152, 158, 163, 150, 148, 148, 156, 151, 158, 152, 129, 87, 87, 101, 105
    };

    byte[] second = new byte[]
    {
        103, 103, 103, 103, 101, 106, 122, 114, 171, 171, 171, 171, 171, 177, 169, 175, 150, 150, 150, 150,
        146, 149, 152, 154, 161, 164, 151, 130, 93, 86, 100, 106, 103, 103, 103, 103, 101, 106, 122, 114,
        171, 171, 171, 171, 171, 177, 169, 175, 150, 150, 150, 150, 146, 149, 152, 154, 158, 161, 148, 127,
        93, 86, 100, 106, 103, 103, 103, 103, 101, 106, 122, 114, 171, 171, 171, 171, 171, 177, 169, 175,
        150, 150, 150, 150, 146, 149, 152, 154, 156, 159, 146, 125, 99, 92, 106, 112, 103, 103, 103, 103,
        101, 106, 122, 114, 171, 171, 171, 171, 171, 177, 169, 175, 148, 148, 148, 148, 149, 158, 162, 159,
        155, 155, 153, 129, 94, 87, 101, 106, 102, 100, 100, 102, 100, 101, 120, 122, 170, 176, 176, 170,
        174, 180, 171, 177, 151, 151, 151, 151, 152, 159, 161, 156, 155, 155, 153, 129, 94, 87, 101, 106,
        102, 105, 105, 102, 100, 101, 120, 122, 170, 176, 176, 170, 174, 180, 171, 177, 154, 154, 154, 154,
        156, 161, 159, 152, 155, 155, 153, 129, 94, 87, 101, 106, 102, 112, 112, 102, 100, 101, 120, 122,
        170, 176, 176, 170, 174, 180, 171, 177, 156, 156, 156, 156, 159, 162, 158, 149, 155, 155, 153, 129,
        94, 87, 101, 106, 102, 117, 117, 102, 100, 101, 120, 122, 170, 176, 176, 170, 174, 180, 171, 177,
        152, 153, 157, 162, 150, 149, 149, 151, 155, 160, 150, 131, 91, 90, 104, 104
    };

    // act
    int computedSse = LossyUtils.Vp8_Sse16X8(first, second);

    // assert
    Assert.Equal(expectedSse, computedSse);
}
196+
79197
private static void RunVp8Sse4X4Test()
80198
{
81199
// arrange
@@ -168,6 +286,12 @@ private static void RunHadamardTransformTest()
168286
[Fact]
169287
public void RunTransformOne_Works() => RunTransformOneTest();
170288

289+
[Fact]
290+
public void Vp8Sse16X16_Works()
{
    // Runs the 16x16 check directly in the current process.
    RunVp8Sse16X16Test();
}
291+
292+
[Fact]
293+
public void Vp8Sse16X8_Works()
{
    // Runs the 16x8 check directly in the current process.
    RunVp8Sse16X8Test();
}
294+
171295
[Fact]
172296
public void Vp8Sse4X4_Works() => RunVp8Sse4X4Test();
173297

@@ -190,6 +314,24 @@ private static void RunHadamardTransformTest()
190314
[Fact]
191315
public void TransformOne_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.DisableHWIntrinsic);
192316

317+
[Fact]
318+
public void Vp8Sse16X16_WithHardwareIntrinsics_Works()
{
    // Runs the 16x16 check through FeatureTestRunner with HwIntrinsics.AllowAll.
    FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.AllowAll);
}
319+
320+
[Fact]
321+
public void Vp8Sse16X16_WithoutSSE2_Works()
{
    // Runs the 16x16 check through FeatureTestRunner with HwIntrinsics.DisableSSE2.
    FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableSSE2);
}
322+
323+
[Fact]
324+
public void Vp8Sse16X16_WithoutAVX2_Works()
{
    // Runs the 16x16 check through FeatureTestRunner with HwIntrinsics.DisableAVX2.
    FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableAVX2);
}
325+
326+
[Fact]
327+
public void Vp8Sse16X8_WithHardwareIntrinsics_Works()
{
    // Runs the 16x8 check through FeatureTestRunner with HwIntrinsics.AllowAll.
    FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.AllowAll);
}
328+
329+
[Fact]
330+
public void Vp8Sse16X8_WithoutSSE2_Works()
{
    // Runs the 16x8 check through FeatureTestRunner with HwIntrinsics.DisableSSE2.
    FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableSSE2);
}
331+
332+
[Fact]
333+
public void Vp8Sse16X8_WithoutAVX2_Works()
{
    // Runs the 16x8 check through FeatureTestRunner with HwIntrinsics.DisableAVX2.
    FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableAVX2);
}
334+
193335
[Fact]
194336
public void Vp8Sse4X4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll);
195337

0 commit comments

Comments
 (0)