Skip to content

Commit ce7687b

Browse files
authored
Merge pull request #1810 from SixLabors/bp/hadamardtransformsse
SSE41 version of Hadamard transform
2 parents c8f6d75 + 2e52890 commit ce7687b

File tree

2 files changed

+194
-5
lines changed

2 files changed

+194
-5
lines changed

src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs

Lines changed: 142 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,16 @@
44
using System;
55
using System.Buffers.Binary;
66
using System.Runtime.CompilerServices;
7+
using System.Runtime.InteropServices;
8+
#if SUPPORTS_RUNTIME_INTRINSICS
9+
using System.Runtime.Intrinsics;
10+
using System.Runtime.Intrinsics.X86;
11+
#endif
712

813
// ReSharper disable InconsistentNaming
914
namespace SixLabors.ImageSharp.Formats.Webp.Lossy
1015
{
11-
internal static class LossyUtils
16+
internal static unsafe class LossyUtils
1217
{
1318
[MethodImpl(InliningOptions.ShortMethod)]
1419
public static int Vp8Sse16X16(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 16);
@@ -61,11 +66,12 @@ public static void Copy(Span<byte> src, Span<byte> dst, int w, int h)
6166
public static int Vp8Disto16X16(Span<byte> a, Span<byte> b, Span<ushort> w, Span<int> scratch)
6267
{
6368
int d = 0;
69+
int dataSize = (4 * WebpConstants.Bps) - 16;
6470
for (int y = 0; y < 16 * WebpConstants.Bps; y += 4 * WebpConstants.Bps)
6571
{
6672
for (int x = 0; x < 16; x += 4)
6773
{
68-
d += Vp8Disto4X4(a.Slice(x + y), b.Slice(x + y), w, scratch);
74+
d += Vp8Disto4X4(a.Slice(x + y, dataSize), b.Slice(x + y, dataSize), w, scratch);
6975
}
7076
}
7177

@@ -75,9 +81,19 @@ public static int Vp8Disto16X16(Span<byte> a, Span<byte> b, Span<ushort> w, Span
7581
/// <summary>
/// Computes a distortion value for a pair of 4x4 blocks: the absolute difference between the
/// weighted Hadamard-transform sums of <paramref name="a"/> and <paramref name="b"/>, shifted right by 5.
/// </summary>
/// <param name="a">The first input block; passed unchanged to the transform.</param>
/// <param name="b">The second input block; passed unchanged to the transform.</param>
/// <param name="w">The weights handed to the transform.</param>
/// <param name="scratch">Scratch buffer handed to the transform.</param>
/// <returns>The absolute difference of the two weighted sums, shifted right by 5 bits.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
7682
public static int Vp8Disto4X4(Span<byte> a, Span<byte> b, Span<ushort> w, Span<int> scratch)
7783
{
78-
int sum1 = TTransform(a, w, scratch);
79-
int sum2 = TTransform(b, w, scratch);
80-
return Math.Abs(sum2 - sum1) >> 5;
// Vectorized path: TTransformSse41 transforms both blocks in one call and already returns
// the difference of the two weighted sums, so only the absolute value and the shift remain.
84+
#if SUPPORTS_RUNTIME_INTRINSICS
85+
if (Sse41.IsSupported)
86+
{
87+
int diffSum = TTransformSse41(a, b, w, scratch);
88+
return Math.Abs(diffSum) >> 5;
89+
}
90+
else
91+
#endif
// Scalar fallback: transform each block separately and subtract the sums here.
// When SUPPORTS_RUNTIME_INTRINSICS is not defined this braced block is the whole method body.
92+
{
93+
int sum1 = TTransform(a, w, scratch);
94+
int sum2 = TTransform(b, w, scratch);
95+
return Math.Abs(sum2 - sum1) >> 5;
96+
}
8197
}
8298

8399
public static void DC16(Span<byte> dst, Span<byte> yuv, int offset)
@@ -589,6 +605,127 @@ public static int TTransform(Span<byte> input, Span<ushort> w, Span<int> scratch
589605
return sum;
590606
}
591607

608+
#if SUPPORTS_RUNTIME_INTRINSICS
609+
/// <summary>
610+
/// Hadamard transform of two 4x4 blocks, computed in parallel.
611+
/// Returns the difference between the weighted sums of the absolute values of the
/// transformed coefficients of <paramref name="inputA"/> and <paramref name="inputB"/> (A minus B).
612+
/// w[] contains a row-major 4 by 4 symmetric matrix.
613+
/// </summary>
/// <param name="inputA">
/// The first block. Rows are read at offsets 0, Bps, 2 * Bps and 3 * Bps; the last three are sliced
/// with a length of 16, so the span has to hold at least 3 * Bps + 16 bytes.
/// Only the first 4 bytes of each row take part in the transform.
/// </param>
/// <param name="inputB">The second block; same layout and size requirement as <paramref name="inputA"/>.</param>
/// <param name="w">The 16 weights, loaded as two vectors of 8 (elements 0..7 and 8..15).</param>
/// <param name="scratch">Scratch buffer; its first 4 elements are cleared and then overwritten with the per-lane results.</param>
/// <returns>The sum of the four 32-bit lanes of (weighted sum of A) - (weighted sum of B). The value can be negative.</returns>
614+
public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ushort> w, Span<int> scratch)
615+
{
616+
Span<int> sum = scratch.Slice(0, 4);
617+
sum.Clear();
618+
619+
// Load and combine inputs.
// NOTE(review): each load reinterprets the reference as a full 16-byte vector. The row 0 loads
// have no Slice of their own, so they rely on the caller passing a span of at least 16 bytes.
620+
Vector128<byte> ina0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA));
621+
Vector128<byte> ina1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps, 16)));
622+
Vector128<byte> ina2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps * 2, 16)));
623+
Vector128<long> ina3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps * 3, 16))).AsInt64();
624+
Vector128<byte> inb0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB));
625+
Vector128<byte> inb1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps, 16)));
626+
Vector128<byte> inb2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps * 2, 16)));
627+
Vector128<long> inb3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps * 3, 16))).AsInt64();
628+
629+
// Combine inA and inB (we'll do two transforms in parallel).
// Interleaving the low 32-bit lanes puts the first 4 bytes of the A row next to the first 4 bytes
// of the B row; the conversion below then widens those 8 bytes to 16-bit lanes.
630+
Vector128<int> inab0 = Sse2.UnpackLow(ina0.AsInt32(), inb0.AsInt32());
631+
Vector128<int> inab1 = Sse2.UnpackLow(ina1.AsInt32(), inb1.AsInt32());
632+
Vector128<int> inab2 = Sse2.UnpackLow(ina2.AsInt32(), inb2.AsInt32());
633+
Vector128<int> inab3 = Sse2.UnpackLow(ina3.AsInt32(), inb3.AsInt32());
634+
Vector128<short> tmp0 = Sse41.ConvertToVector128Int16(inab0.AsByte());
635+
Vector128<short> tmp1 = Sse41.ConvertToVector128Int16(inab1.AsByte());
636+
Vector128<short> tmp2 = Sse41.ConvertToVector128Int16(inab2.AsByte());
637+
Vector128<short> tmp3 = Sse41.ConvertToVector128Int16(inab3.AsByte());
638+
639+
// a00 a01 a02 a03 b00 b01 b02 b03
640+
// a10 a11 a12 a13 b10 b11 b12 b13
641+
// a20 a21 a22 a23 b20 b21 b22 b23
642+
// a30 a31 a32 a33 b30 b31 b32 b33
643+
// Vertical pass first to avoid a transpose (vertical and horizontal passes
644+
// are commutative because w/kWeightY is symmetric) and subsequent transpose.
645+
// Calculate a and b (two 4x4 at once).
646+
Vector128<short> a0 = Sse2.Add(tmp0, tmp2);
647+
Vector128<short> a1 = Sse2.Add(tmp1, tmp3);
648+
Vector128<short> a2 = Sse2.Subtract(tmp1, tmp3);
649+
Vector128<short> a3 = Sse2.Subtract(tmp0, tmp2);
650+
Vector128<short> b0 = Sse2.Add(a0, a1);
651+
Vector128<short> b1 = Sse2.Add(a3, a2);
652+
Vector128<short> b2 = Sse2.Subtract(a3, a2);
653+
Vector128<short> b3 = Sse2.Subtract(a0, a1);
654+
655+
// a00 a01 a02 a03 b00 b01 b02 b03
656+
// a10 a11 a12 a13 b10 b11 b12 b13
657+
// a20 a21 a22 a23 b20 b21 b22 b23
658+
// a30 a31 a32 a33 b30 b31 b32 b33
659+
// Transpose the two 4x4.
660+
Vector128<short> transpose00 = Sse2.UnpackLow(b0, b1);
661+
Vector128<short> transpose01 = Sse2.UnpackLow(b2, b3);
662+
Vector128<short> transpose02 = Sse2.UnpackHigh(b0, b1);
663+
Vector128<short> transpose03 = Sse2.UnpackHigh(b2, b3);
664+
665+
// a00 a10 a01 a11 a02 a12 a03 a13
666+
// a20 a30 a21 a31 a22 a32 a23 a33
667+
// b00 b10 b01 b11 b02 b12 b03 b13
668+
// b20 b30 b21 b31 b22 b32 b23 b33
669+
Vector128<int> transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32());
670+
Vector128<int> transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32());
671+
Vector128<int> transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32());
672+
Vector128<int> transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32());
673+
674+
// a00 a10 a20 a30 a01 a11 a21 a31
675+
// b00 b10 b20 b30 b01 b11 b21 b31
676+
// a02 a12 a22 a32 a03 a13 a23 a33
677+
// b02 b12 b22 b32 b03 b13 b23 b33
678+
Vector128<long> output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64());
679+
Vector128<long> output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64());
680+
Vector128<long> output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64());
681+
Vector128<long> output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64());
682+
683+
// a00 a10 a20 a30 b00 b10 b20 b30
684+
// a01 a11 a21 a31 b01 b11 b21 b31
685+
// a02 a12 a22 a32 b02 b12 b22 b32
686+
// a03 a13 a23 a33 b03 b13 b23 b33
687+
// Horizontal pass and difference of weighted sums.
// The Slice(8, 8) below means w has to hold at least 16 weights.
688+
Vector128<ushort> w0 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(w));
689+
Vector128<ushort> w8 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(w.Slice(8, 8)));
690+
691+
// Calculate a and b (two 4x4 at once).
692+
a0 = Sse2.Add(output0.AsInt16(), output2.AsInt16());
693+
a1 = Sse2.Add(output1.AsInt16(), output3.AsInt16());
694+
a2 = Sse2.Subtract(output1.AsInt16(), output3.AsInt16());
695+
a3 = Sse2.Subtract(output0.AsInt16(), output2.AsInt16());
696+
b0 = Sse2.Add(a0, a1);
697+
b1 = Sse2.Add(a3, a2);
698+
b2 = Sse2.Subtract(a3, a2);
699+
b3 = Sse2.Subtract(a0, a1);
700+
701+
// Separate the transforms of inA and inB.
// The low 64 bits of each b-vector hold the inA coefficients and the high 64 bits the inB ones,
// so ab0/ab2 collect the 16 coefficients of A and bb0/bb2 the 16 coefficients of B.
702+
Vector128<long> ab0 = Sse2.UnpackLow(b0.AsInt64(), b1.AsInt64());
703+
Vector128<long> ab2 = Sse2.UnpackLow(b2.AsInt64(), b3.AsInt64());
704+
Vector128<long> bb0 = Sse2.UnpackHigh(b0.AsInt64(), b1.AsInt64());
705+
Vector128<long> bb2 = Sse2.UnpackHigh(b2.AsInt64(), b3.AsInt64());
706+
// Note: b0Abs below is the absolute value of bb0 (the inB half), despite the missing second 'b' in its name.
707+
Vector128<ushort> ab0Abs = Ssse3.Abs(ab0.AsInt16());
708+
Vector128<ushort> ab2Abs = Ssse3.Abs(ab2.AsInt16());
709+
Vector128<ushort> b0Abs = Ssse3.Abs(bb0.AsInt16());
710+
Vector128<ushort> bb2Abs = Ssse3.Abs(bb2.AsInt16());
711+
712+
// weighted sums.
// MultiplyAddAdjacent multiplies the 16-bit lanes pairwise and adds neighbouring products,
// giving four 32-bit partial sums per vector.
713+
Vector128<int> ab0mulw0 = Sse2.MultiplyAddAdjacent(ab0Abs.AsInt16(), w0.AsInt16());
714+
Vector128<int> ab2mulw8 = Sse2.MultiplyAddAdjacent(ab2Abs.AsInt16(), w8.AsInt16());
715+
Vector128<int> b0mulw0 = Sse2.MultiplyAddAdjacent(b0Abs.AsInt16(), w0.AsInt16());
716+
Vector128<int> bb2mulw8 = Sse2.MultiplyAddAdjacent(bb2Abs.AsInt16(), w8.AsInt16());
717+
Vector128<int> ab0ab2Sum = Sse2.Add(ab0mulw0, ab2mulw8);
718+
Vector128<int> b0w0bb2w8Sum = Sse2.Add(b0mulw0, bb2mulw8);
719+
720+
// difference of weighted sums.
721+
Vector128<int> result = Sse2.Subtract(ab0ab2Sum.AsInt32(), b0w0bb2w8Sum.AsInt32());
722+
// Spill the four lanes into the scratch slice and add them up for the final scalar.
723+
ref int outputRef = ref MemoryMarshal.GetReference(sum);
724+
Unsafe.As<int, Vector128<int>>(ref outputRef) = result.AsInt32();
725+
return sum[3] + sum[2] + sum[1] + sum[0];
726+
}
727+
#endif
728+
592729
public static void TransformTwo(Span<short> src, Span<byte> dst, Span<int> scratch)
593730
{
594731
TransformOne(src, dst, scratch);
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
// Copyright (c) Six Labors.
2+
// Licensed under the Apache License, Version 2.0.
3+
4+
using SixLabors.ImageSharp.Formats.Webp.Lossy;
5+
using SixLabors.ImageSharp.Tests.TestUtilities;
6+
using Xunit;
7+
8+
namespace SixLabors.ImageSharp.Tests.Formats.WebP
9+
{
10+
// Tests for LossyUtils.Vp8Disto4X4, run directly and through FeatureTestRunner with
// hardware intrinsics allowed and disabled.
[Trait("Format", "Webp")]
11+
public class LossyUtilsTests
12+
{
// Shared test body: feeds two fixed byte blocks and a fixed weight table to
// LossyUtils.Vp8Disto4X4 and checks the result against a hard-coded expected value.
13+
private static void RunHadamardTransformTest()
14+
{
15+
byte[] a =
16+
{
17+
27, 27, 28, 29, 29, 28, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27, 129, 129, 129, 129, 129, 129, 129,
18+
129, 128, 128, 128, 128, 128, 128, 128, 128, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 29, 29, 28,
19+
28, 27, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 27, 27, 26,
20+
26, 26, 26, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27, 129, 129, 129, 129, 129, 129, 129, 129, 128,
21+
128, 128, 128, 128, 128, 128, 128, 28, 27, 27, 26, 26, 27, 27, 28, 27, 28, 28, 29, 29, 28, 28, 27
22+
};
23+
24+
byte[] b =
25+
{
26+
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 204, 204, 204, 204, 204, 204, 204,
27+
204, 204, 204, 204, 204, 204, 204, 204, 204, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
28+
28, 28, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 28, 28, 28,
29+
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 204, 204, 204, 204, 204, 204, 204, 204, 204,
30+
204, 204, 204, 204, 204, 204, 204, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28
31+
};
32+
// 16 weights, i.e. one per coefficient of a 4x4 block.
33+
ushort[] w = { 38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2 };
34+
int expected = 2;
35+
// A fresh 16-element int array serves as the scratch buffer.
36+
int actual = LossyUtils.Vp8Disto4X4(a, b, w, new int[16]);
37+
Assert.Equal(expected, actual);
38+
}
39+
// Runs the shared test body as-is.
40+
[Fact]
41+
public void HadamardTransform_Works() => RunHadamardTransformTest();
42+
// The two tests below hand the same body to FeatureTestRunner with different HwIntrinsics flags.
// NOTE(review): presumably this exercises both the SSE4.1 path and the scalar fallback of
// Vp8Disto4X4 - confirm against FeatureTestRunner, which is not shown here.
43+
#if SUPPORTS_RUNTIME_INTRINSICS
44+
[Fact]
45+
public void HadamardTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.AllowAll);
46+
47+
[Fact]
48+
public void HadamardTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableHWIntrinsic);
49+
#endif
50+
51+
}
52+
}

0 commit comments

Comments
 (0)