Skip to content

Commit 255226b

Browse files
authored
Merge pull request #1814 from SixLabors/bp/meansse
Add SSE2 version of Mean16x4
2 parents 7495a91 + 7d8225b commit 255226b

File tree

5 files changed

+132
-63
lines changed

5 files changed

+132
-63
lines changed

src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs

Lines changed: 52 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,12 @@
1313
// ReSharper disable InconsistentNaming
1414
namespace SixLabors.ImageSharp.Formats.Webp.Lossy
1515
{
16-
internal static unsafe class LossyUtils
16+
internal static class LossyUtils
1717
{
18+
#if SUPPORTS_RUNTIME_INTRINSICS
19+
private static readonly Vector128<byte> Mean16x4Mask = Vector128.Create((short)0x00ff).AsByte();
20+
#endif
21+
1822
[MethodImpl(InliningOptions.ShortMethod)]
1923
public static int Vp8Sse16X16(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 16);
2024

@@ -938,26 +942,55 @@ public static void HFilter8i(Span<byte> u, Span<byte> v, int offset, int stride,
938942
FilterLoop24(v, offsetPlus4, 1, stride, 8, thresh, ithresh, hevThresh);
939943
}
940944

941-
[MethodImpl(InliningOptions.ShortMethod)]
942-
public static uint LoadUv(byte u, byte v) =>
943-
(uint)(u | (v << 16)); // We process u and v together stashed into 32bit(16bit each).
944-
945-
[MethodImpl(InliningOptions.ShortMethod)]
946-
public static void YuvToBgr(int y, int u, int v, Span<byte> bgr)
/// <summary>
/// Computes the pixel sums of four horizontally adjacent 4x4 blocks (a 16x4 strip).
/// Despite the name, the values written are the plain sums of the 16 samples of each block,
/// no division is performed.
/// </summary>
/// <param name="input">
/// The source samples. Rows are <see cref="WebpConstants.Bps"/> bytes apart and the first 16 bytes
/// of four consecutive rows are read, so at least (3 * Bps) + 16 bytes are required.
/// </param>
/// <param name="dc">Receives the four block sums. Must contain at least 4 elements.</param>
public static void Mean16x4(Span<byte> input, Span<uint> dc)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    if (Ssse3.IsSupported)
    {
        // The vector loads/stores below go through raw references and are not bounds checked,
        // so validate both spans up front. Slice throws when a span is too short instead of
        // silently reading or writing past its end.
        Span<byte> rows = input.Slice(0, (WebpConstants.Bps * 3) + 16);
        Span<uint> sums = dc.Slice(0, 4);

        Vector128<byte> a0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(rows));
        Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(rows.Slice(WebpConstants.Bps, 16)));
        Vector128<byte> a2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(rows.Slice(WebpConstants.Bps * 2, 16)));
        Vector128<byte> a3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(rows.Slice(WebpConstants.Bps * 3, 16)));

        // Split every 16 bit lane into its high and low byte, each zero extended to 16 bits.
        Vector128<short> b0 = Sse2.ShiftRightLogical(a0.AsInt16(), 8); // hi byte
        Vector128<short> b1 = Sse2.ShiftRightLogical(a1.AsInt16(), 8);
        Vector128<short> b2 = Sse2.ShiftRightLogical(a2.AsInt16(), 8);
        Vector128<short> b3 = Sse2.ShiftRightLogical(a3.AsInt16(), 8);
        Vector128<byte> c0 = Sse2.And(a0, Mean16x4Mask); // lo byte
        Vector128<byte> c1 = Sse2.And(a1, Mean16x4Mask);
        Vector128<byte> c2 = Sse2.And(a2, Mean16x4Mask);
        Vector128<byte> c3 = Sse2.And(a3, Mean16x4Mask);

        // Every 16 bit lane holds at most 255 + 255, so the 32 bit adds never carry between lanes.
        Vector128<int> d0 = Sse2.Add(b0.AsInt32(), c0.AsInt32());
        Vector128<int> d1 = Sse2.Add(b1.AsInt32(), c1.AsInt32());
        Vector128<int> d2 = Sse2.Add(b2.AsInt32(), c2.AsInt32());
        Vector128<int> d3 = Sse2.Add(b3.AsInt32(), c3.AsInt32());

        // Accumulate the four rows.
        Vector128<int> e0 = Sse2.Add(d0, d1);
        Vector128<int> e1 = Sse2.Add(d2, d3);
        Vector128<int> f0 = Sse2.Add(e0, e1);

        // Adding adjacent 16 bit lanes yields one sum per 4x4 block, which is then widened to 32 bits.
        Vector128<short> hadd = Ssse3.HorizontalAdd(f0.AsInt16(), f0.AsInt16());
        Vector128<uint> wide = Sse2.UnpackLow(hadd, Vector128<short>.Zero).AsUInt32();

        ref uint outputRef = ref MemoryMarshal.GetReference(sums);
        Unsafe.As<uint, Vector128<uint>>(ref outputRef) = wide;
    }
    else
#endif
    {
        for (int k = 0; k < 4; k++)
        {
            // Each 4x4 block starts 4 bytes to the right of the previous one.
            int blockOffset = k * 4;
            uint sum = 0;
            for (int y = 0; y < 4; y++)
            {
                int rowOffset = blockOffset + (y * WebpConstants.Bps);
                for (int x = 0; x < 4; x++)
                {
                    sum += input[rowOffset + x];
                }
            }

            dc[k] = sum;
        }
    }
}
961994

962995
[MethodImpl(InliningOptions.ShortMethod)]
963996
public static byte Avg2(byte a, byte b) => (byte)((a + b + 1) >> 1);
@@ -1163,9 +1196,6 @@ private static bool Hev(Span<byte> p, int offset, int step, int thresh)
11631196
return WebpLookupTables.Abs0(p1 - p0) > thresh || WebpLookupTables.Abs0(q1 - q0) > thresh;
11641197
}
11651198

1166-
[MethodImpl(InliningOptions.ShortMethod)]
1167-
private static int MultHi(int v, int coeff) => (v * coeff) >> 8;
1168-
11691199
[MethodImpl(InliningOptions.ShortMethod)]
11701200
private static void Store(Span<byte> dst, int x, int y, int v)
11711201
{
@@ -1188,13 +1218,6 @@ private static void Store2(Span<byte> dst, int y, int dc, int d, int c)
11881218
[MethodImpl(InliningOptions.ShortMethod)]
11891219
private static int Mul2(int a) => (a * 35468) >> 16;
11901220

1191-
[MethodImpl(InliningOptions.ShortMethod)]
1192-
private static byte Clip8(int v)
1193-
{
1194-
int yuvMask = (256 << 6) - 1;
1195-
return (byte)((v & ~yuvMask) == 0 ? v >> 6 : v < 0 ? 0 : 255);
1196-
}
1197-
11981221
[MethodImpl(InliningOptions.ShortMethod)]
11991222
private static void Put8x8uv(byte value, Span<byte> dst)
12001223
{

src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs

Lines changed: 4 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -357,15 +357,16 @@ public int FastMbAnalyze(int quality)
357357
int q = quality;
358358
int kThreshold = 8 + ((17 - 8) * q / 100);
359359
int k;
360-
uint[] dc = new uint[16];
360+
Span<uint> dc = stackalloc uint[16];
361+
Span<ushort> tmp = stackalloc ushort[16];
361362
uint m;
362363
uint m2;
363364
for (k = 0; k < 16; k += 4)
364365
{
365-
this.Mean16x4(this.YuvIn.AsSpan(YOffEnc + (k * WebpConstants.Bps)), dc.AsSpan(k));
366+
LossyUtils.Mean16x4(this.YuvIn.AsSpan(YOffEnc + (k * WebpConstants.Bps)), dc.Slice(k, 4));
366367
}
367368

368-
for (m = 0, m2 = 0, k = 0; k < 16; ++k)
369+
for (m = 0, m2 = 0, k = 0; k < 16; k++)
369370
{
370371
m += dc[k];
371372
m2 += dc[k] * dc[k];
@@ -823,24 +824,6 @@ public void BytesToNz()
823824
this.Nz[this.nzIdx] = nz;
824825
}
825826

826-
private void Mean16x4(Span<byte> input, Span<uint> dc)
827-
{
828-
for (int k = 0; k < 4; k++)
829-
{
830-
uint avg = 0;
831-
for (int y = 0; y < 4; y++)
832-
{
833-
for (int x = 0; x < 4; x++)
834-
{
835-
avg += input[x + (y * WebpConstants.Bps)];
836-
}
837-
}
838-
839-
dc[k] = avg;
840-
input = input.Slice(4); // go to next 4x4 block.
841-
}
842-
}
843-
844827
private void ImportBlock(Span<byte> src, int srcStride, Span<byte> dst, int w, int h, int size)
845828
{
846829
int dstIdx = 0;

src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -747,21 +747,21 @@ private void UpSample(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span
747747
{
748748
int xStep = 3;
749749
int lastPixelPair = (len - 1) >> 1;
750-
uint tluv = LossyUtils.LoadUv(topU[0], topV[0]); // top-left sample
751-
uint luv = LossyUtils.LoadUv(curU[0], curV[0]); // left-sample
750+
uint tluv = YuvConversion.LoadUv(topU[0], topV[0]); // top-left sample
751+
uint luv = YuvConversion.LoadUv(curU[0], curV[0]); // left-sample
752752
uint uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2;
753-
LossyUtils.YuvToBgr(topY[0], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst);
753+
YuvConversion.YuvToBgr(topY[0], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst);
754754

755755
if (bottomY != null)
756756
{
757757
uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2;
758-
LossyUtils.YuvToBgr(bottomY[0], (int)uv0 & 0xff, (int)(uv0 >> 16), bottomDst);
758+
YuvConversion.YuvToBgr(bottomY[0], (int)uv0 & 0xff, (int)(uv0 >> 16), bottomDst);
759759
}
760760

761761
for (int x = 1; x <= lastPixelPair; x++)
762762
{
763-
uint tuv = LossyUtils.LoadUv(topU[x], topV[x]); // top sample
764-
uint uv = LossyUtils.LoadUv(curU[x], curV[x]); // sample
763+
uint tuv = YuvConversion.LoadUv(topU[x], topV[x]); // top sample
764+
uint uv = YuvConversion.LoadUv(curU[x], curV[x]); // sample
765765

766766
// Precompute invariant values associated with first and second diagonals.
767767
uint avg = tluv + tuv + luv + uv + 0x00080008u;
@@ -770,15 +770,15 @@ private void UpSample(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span
770770
uv0 = (diag12 + tluv) >> 1;
771771
uint uv1 = (diag03 + tuv) >> 1;
772772
int xMul2 = x * 2;
773-
LossyUtils.YuvToBgr(topY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((xMul2 - 1) * xStep));
774-
LossyUtils.YuvToBgr(topY[xMul2 - 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), topDst.Slice((xMul2 - 0) * xStep));
773+
YuvConversion.YuvToBgr(topY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((xMul2 - 1) * xStep));
774+
YuvConversion.YuvToBgr(topY[xMul2 - 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), topDst.Slice((xMul2 - 0) * xStep));
775775

776776
if (bottomY != null)
777777
{
778778
uv0 = (diag03 + luv) >> 1;
779779
uv1 = (diag12 + uv) >> 1;
780-
LossyUtils.YuvToBgr(bottomY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((xMul2 - 1) * xStep));
781-
LossyUtils.YuvToBgr(bottomY[xMul2 + 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), bottomDst.Slice((xMul2 + 0) * xStep));
780+
YuvConversion.YuvToBgr(bottomY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((xMul2 - 1) * xStep));
781+
YuvConversion.YuvToBgr(bottomY[xMul2 + 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), bottomDst.Slice((xMul2 + 0) * xStep));
782782
}
783783

784784
tluv = tuv;
@@ -788,11 +788,11 @@ private void UpSample(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span
788788
if ((len & 1) == 0)
789789
{
790790
uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2;
791-
LossyUtils.YuvToBgr(topY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((len - 1) * xStep));
791+
YuvConversion.YuvToBgr(topY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((len - 1) * xStep));
792792
if (bottomY != null)
793793
{
794794
uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2;
795-
LossyUtils.YuvToBgr(bottomY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((len - 1) * xStep));
795+
YuvConversion.YuvToBgr(bottomY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((len - 1) * xStep));
796796
}
797797
}
798798
}

src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,5 +299,36 @@ private static int ClipUv(int uv, int rounding)
299299
uv = (uv + rounding + (128 << (YuvFix + 2))) >> (YuvFix + 2);
300300
return (uv & ~0xff) == 0 ? uv : uv < 0 ? 0 : 255;
301301
}
302+
/// <summary>
/// Packs a U and a V sample into one 32 bit value so that both can be processed together.
/// </summary>
/// <param name="u">The U sample, placed in the low 16 bits.</param>
/// <param name="v">The V sample, placed in the high 16 bits.</param>
/// <returns>The packed u/v pair.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static uint LoadUv(byte u, byte v)
{
    uint lowHalf = u;
    uint highHalf = (uint)v << 16;
    return highHalf | lowHalf;
}
306+
/// <summary>
/// Converts a single YUV sample triple and writes the result as three bytes in B, G, R order.
/// </summary>
/// <param name="y">The luma sample.</param>
/// <param name="u">The U sample.</param>
/// <param name="v">The V sample.</param>
/// <param name="bgr">The destination, at least 3 bytes long.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void YuvToBgr(int y, int u, int v, Span<byte> bgr)
{
    int red = YuvToR(y, v);
    int green = YuvToG(y, u, v);
    int blue = YuvToB(y, u);

    // The highest index is written first, exactly as before.
    bgr[2] = (byte)red;
    bgr[1] = (byte)green;
    bgr[0] = (byte)blue;
}
314+
/// <summary>
/// Computes the blue channel from a luma and a U sample.
/// </summary>
/// <param name="y">The luma sample.</param>
/// <param name="u">The U sample.</param>
/// <returns>The blue value, clipped by <c>Clip8</c>.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static int YuvToB(int y, int u)
{
    int lumaTerm = MultHi(y, 19077);
    int chromaTerm = MultHi(u, 33050);
    return Clip8(lumaTerm + chromaTerm - 17685);
}
317+
/// <summary>
/// Computes the green channel from a luma, a U and a V sample.
/// </summary>
/// <param name="y">The luma sample.</param>
/// <param name="u">The U sample.</param>
/// <param name="v">The V sample.</param>
/// <returns>The green value, clipped by <c>Clip8</c>.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static int YuvToG(int y, int u, int v)
{
    int lumaTerm = MultHi(y, 19077);
    int uTerm = MultHi(u, 6419);
    int vTerm = MultHi(v, 13320);
    return Clip8(lumaTerm - uTerm - vTerm + 8708);
}
320+
/// <summary>
/// Computes the red channel from a luma and a V sample.
/// </summary>
/// <param name="y">The luma sample.</param>
/// <param name="v">The V sample.</param>
/// <returns>The red value, clipped by <c>Clip8</c>.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static int YuvToR(int y, int v)
{
    int lumaTerm = MultHi(y, 19077);
    int chromaTerm = MultHi(v, 26149);
    return Clip8(lumaTerm + chromaTerm - 14234);
}
323+
/// <summary>
/// Multiplies a sample by a coefficient and drops the lowest 8 bits of the product.
/// </summary>
/// <param name="v">The sample value.</param>
/// <param name="coeff">The coefficient.</param>
/// <returns>The product shifted right by 8 bits.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static int MultHi(int v, int coeff)
{
    int product = v * coeff;
    return product >> 8;
}
326+
/// <summary>
/// Shifts the value right by 6 bits and clamps the result to the byte range.
/// </summary>
/// <param name="v">The value to clip.</param>
/// <returns>
/// <c>v &gt;&gt; 6</c> when no bits outside the low 14 are set, otherwise 0 for negative
/// and 255 for positive values.
/// </returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static byte Clip8(int v)
{
    // Any bit set at or above bit 14 (including the sign bit) means the value is out of range.
    int outOfRangeBits = ~((256 << 6) - 1);
    if ((v & outOfRangeBits) == 0)
    {
        return (byte)(v >> 6);
    }

    if (v < 0)
    {
        return 0;
    }

    return 255;
}
302333
}
303334
}

tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// Copyright (c) Six Labors.
22
// Licensed under the Apache License, Version 2.0.
33

4+
using System.Linq;
45
using SixLabors.ImageSharp.Formats.Webp.Lossy;
56
using SixLabors.ImageSharp.Tests.TestUtilities;
67
using Xunit;
@@ -10,6 +11,29 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP
1011
[Trait("Format", "Webp")]
1112
public class LossyUtilsTests
1213
{
// Runs Mean16x4 over four rows of 32 samples and checks the four 4x4 block sums
// of the first 16 columns against known good values.
private static void RunMean16x4Test()
{
    // arrange
    byte[] input =
    {
        154, 145, 102, 115, 127, 129, 126, 125, 126, 120, 133, 152, 157, 153, 119, 94, 104, 116, 111, 113,
        113, 109, 105, 124, 173, 175, 177, 170, 175, 172, 166, 164, 151, 141, 99, 114, 125, 126, 135, 150,
        133, 115, 127, 149, 141, 168, 100, 54, 110, 117, 115, 116, 119, 115, 117, 130, 174, 174, 174, 157,
        146, 171, 166, 158, 117, 140, 96, 111, 119, 119, 136, 171, 188, 134, 121, 126, 136, 119, 59, 77,
        109, 115, 113, 120, 120, 117, 128, 115, 174, 173, 173, 161, 152, 148, 153, 162, 105, 140, 96, 114,
        115, 122, 141, 173, 190, 190, 142, 106, 151, 78, 66, 141, 110, 117, 123, 136, 118, 124, 127, 114,
        173, 175, 166, 155, 155, 159, 159, 158
    };
    uint[] dc = new uint[4];
    uint[] expectedDc = { 1940, 2139, 2252, 1813 };

    // act
    LossyUtils.Mean16x4(input, dc);

    // assert
    // Assert.Equal compares element by element and reports the differing values on failure,
    // whereas Assert.True(SequenceEqual) would only report "expected True".
    Assert.Equal(expectedDc, dc);
}
36+
1337
private static void RunHadamardTransformTest()
1438
{
1539
byte[] a =
@@ -37,16 +61,24 @@ private static void RunHadamardTransformTest()
3761
Assert.Equal(expected, actual);
3862
}
3963

64+
[Fact]
65+
public void Mean16x4_Works() => RunMean16x4Test();
66+
4067
[Fact]
4168
public void HadamardTransform_Works() => RunHadamardTransformTest();
4269

4370
#if SUPPORTS_RUNTIME_INTRINSICS
71+
[Fact]
72+
public void Mean16x4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll);
73+
74+
[Fact]
75+
public void Mean16x4_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.DisableHWIntrinsic);
76+
4477
[Fact]
4578
public void HadamardTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.AllowAll);
4679

4780
[Fact]
4881
public void HadamardTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableHWIntrinsic);
4982
#endif
50-
5183
}
5284
}

0 commit comments

Comments
 (0)