Skip to content

Commit 863bddb

Browse files
Merge branch 'master' into js/convolution-experiments
2 parents 9e3dd8a + 1f351ee commit 863bddb

File tree

10 files changed

+848
-63
lines changed

10 files changed

+848
-63
lines changed

src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
using System.Runtime.InteropServices;
88
using System.Runtime.Intrinsics;
99
using System.Runtime.Intrinsics.X86;
10+
using SixLabors.ImageSharp.PixelFormats;
1011

1112
namespace SixLabors.ImageSharp
1213
{
@@ -22,6 +23,20 @@ public static class HwIntrinsics
2223

2324
// Byte-shuffle control table: the first 12 entries select source bytes 0-2, 4-6, 8-10 and 12-14,
// i.e. every 4th byte (3, 7, 11, 15) is skipped. The last four entries are 0x80.
// NOTE(review): 0x80 presumably zeroes those output bytes when the table is fed to a byte shuffle
// instruction; the consumer is not visible here - confirm at the call site.
private static ReadOnlySpan<byte> ShuffleMaskSlice4Nx16 => new byte[] { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 0x80, 0x80, 0x80, 0x80 };
2425

26+
/// <summary>
/// Gets the 32-byte control table passed to <c>Avx2.Shuffle</c> by the Rgb24 overload of
/// <c>PackFromRgbPlanesAvx2Reduce</c>. The same 16-byte pattern is repeated for both halves:
/// bytes 0-2, 4-6, 8-10 and 12-14 are moved to the first 12 positions and
/// bytes 3, 7, 11 and 15 (every 4th byte) are moved to the last 4 positions.
/// </summary>
private static ReadOnlySpan<byte> ShuffleMaskShiftAlpha => new byte[]
{
    // First 16-byte half: three-byte groups first, then every 4th byte.
    0, 1, 2,
    4, 5, 6,
    8, 9, 10,
    12, 13, 14,
    3, 7, 11, 15,

    // Second 16-byte half: identical pattern.
    0, 1, 2,
    4, 5, 6,
    8, 9, 10,
    12, 13, 14,
    3, 7, 11, 15
};
32+
33+
/// <summary>
/// Gets the 32-byte control table that <c>PackFromRgbPlanesAvx2Reduce</c> reinterprets as a
/// <c>Vector256&lt;uint&gt;</c> and passes to <c>Avx2.PermuteVar8x32</c>.
/// Each group of four bytes holds one index in its first byte (the other three are zero);
/// the index sequence is 0, 1, 2, 4, 5, 6, 3, 7.
/// </summary>
public static ReadOnlySpan<byte> PermuteMaskShiftAlpha8x32 => new byte[]
{
    // Indices 0, 1, 2 and 4, 5, 6.
    0, 0, 0, 0,
    1, 0, 0, 0,
    2, 0, 0, 0,
    4, 0, 0, 0,
    5, 0, 0, 0,
    6, 0, 0, 0,

    // Indices 3 and 7 go last.
    3, 0, 0, 0,
    7, 0, 0, 0
};
39+
2540
/// <summary>
2641
/// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
2742
/// using the control and store the results in <paramref name="dest"/>.
@@ -789,6 +804,138 @@ internal static void NormalizedFloatToByteSaturate(
789804
}
790805
}
791806
}
807+
808+
/// <summary>
/// AVX2 path that interleaves separate red, green and blue byte planes into packed <see cref="Rgb24"/> pixels.
/// Processes as many whole batches of <c>Vector256&lt;byte&gt;.Count</c> pixels as fit into
/// <paramref name="redChannel"/>, then advances all four spans past the processed pixels so the caller
/// can handle the remainder.
/// </summary>
/// <param name="redChannel">The red plane; replaced with its unprocessed tail on return.</param>
/// <param name="greenChannel">The green plane; replaced with its unprocessed tail on return.</param>
/// <param name="blueChannel">The blue plane; replaced with its unprocessed tail on return.</param>
/// <param name="destination">
/// The target pixels; replaced with the unprocessed tail on return.
/// Each batch is written as four 32-byte stores placed 24 bytes apart, so the last store of a batch
/// reaches 8 bytes beyond the 96 bytes that belong to the batch. The span must be large enough for that.
/// </param>
internal static void PackFromRgbPlanesAvx2Reduce(
    ref ReadOnlySpan<byte> redChannel,
    ref ReadOnlySpan<byte> greenChannel,
    ref ReadOnlySpan<byte> blueChannel,
    ref Span<Rgb24> destination)
{
    // Distance in bytes between two consecutive 32-byte stores.
    const int StoreStride = 24;

    ref Vector256<byte> redVectors = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(redChannel));
    ref Vector256<byte> greenVectors = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(greenChannel));
    ref Vector256<byte> blueVectors = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(blueChannel));
    ref byte destinationBytes = ref Unsafe.As<Rgb24, byte>(ref MemoryMarshal.GetReference(destination));

    int batches = redChannel.Length / Vector256<byte>.Count;

    // Control vectors are loaded once, outside the loop.
    Vector256<uint> inputPermute = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskEvenOdd8x32));
    Vector256<uint> packPermute = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(PermuteMaskShiftAlpha8x32));
    Vector256<byte> fourthByteToEnd = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ShuffleMaskShiftAlpha));

    // Filler that is interleaved as the 4th byte of every pixel and shifted out again below.
    Vector256<byte> filler = Vector256.Create((byte)255);

    for (int batch = 0; batch < batches; batch++)
    {
        // Load one vector per plane and reorder its 32-bit elements ahead of the in-lane unpacks.
        Vector256<byte> red = Avx2.PermuteVar8x32(Unsafe.Add(ref redVectors, batch).AsUInt32(), inputPermute).AsByte();
        Vector256<byte> green = Avx2.PermuteVar8x32(Unsafe.Add(ref greenVectors, batch).AsUInt32(), inputPermute).AsByte();
        Vector256<byte> blue = Avx2.PermuteVar8x32(Unsafe.Add(ref blueVectors, batch).AsUInt32(), inputPermute).AsByte();

        // Interleave bytes into (r, g) and (b, filler) pairs.
        Vector256<byte> redGreenLow = Avx2.UnpackLow(red, green);
        Vector256<byte> blueFillLow = Avx2.UnpackLow(blue, filler);
        Vector256<byte> redGreenHigh = Avx2.UnpackHigh(red, green);
        Vector256<byte> blueFillHigh = Avx2.UnpackHigh(blue, filler);

        // Interleave the 16-bit pairs into 4-byte (r, g, b, filler) groups.
        Vector256<byte> pixels0 = Avx2.UnpackLow(redGreenLow.AsUInt16(), blueFillLow.AsUInt16()).AsByte();
        Vector256<byte> pixels1 = Avx2.UnpackHigh(redGreenLow.AsUInt16(), blueFillLow.AsUInt16()).AsByte();
        Vector256<byte> pixels2 = Avx2.UnpackLow(redGreenHigh.AsUInt16(), blueFillHigh.AsUInt16()).AsByte();
        Vector256<byte> pixels3 = Avx2.UnpackHigh(redGreenHigh.AsUInt16(), blueFillHigh.AsUInt16()).AsByte();

        // Move every 4th byte to the end of its 16-byte half, then gather the two 12-byte
        // rgb runs into the first 24 bytes of the vector.
        pixels0 = Avx2.PermuteVar8x32(Avx2.Shuffle(pixels0, fourthByteToEnd).AsUInt32(), packPermute).AsByte();
        pixels1 = Avx2.PermuteVar8x32(Avx2.Shuffle(pixels1, fourthByteToEnd).AsUInt32(), packPermute).AsByte();
        pixels2 = Avx2.PermuteVar8x32(Avx2.Shuffle(pixels2, fourthByteToEnd).AsUInt32(), packPermute).AsByte();
        pixels3 = Avx2.PermuteVar8x32(Avx2.Shuffle(pixels3, fourthByteToEnd).AsUInt32(), packPermute).AsByte();

        // Stores must stay in ascending order: each 32-byte store spills 8 bytes into the
        // region of the next one, which then overwrites the spill with real pixel data.
        ref byte target = ref Unsafe.Add(ref destinationBytes, batch * 4 * StoreStride);
        Unsafe.As<byte, Vector256<byte>>(ref target) = pixels0;
        Unsafe.As<byte, Vector256<byte>>(ref Unsafe.Add(ref target, StoreStride)) = pixels1;
        Unsafe.As<byte, Vector256<byte>>(ref Unsafe.Add(ref target, 2 * StoreStride)) = pixels2;
        Unsafe.As<byte, Vector256<byte>>(ref Unsafe.Add(ref target, 3 * StoreStride)) = pixels3;
    }

    // Hand the unprocessed tail back to the caller.
    int processed = batches * Vector256<byte>.Count;
    redChannel = redChannel.Slice(processed);
    greenChannel = greenChannel.Slice(processed);
    blueChannel = blueChannel.Slice(processed);
    destination = destination.Slice(processed);
}
880+
881+
/// <summary>
/// AVX2 path that interleaves separate red, green and blue byte planes into <see cref="Rgba32"/> pixels
/// with the alpha byte set to 255. Processes as many whole batches of <c>Vector256&lt;byte&gt;.Count</c>
/// pixels as fit into <paramref name="redChannel"/>, then advances all four spans past the processed
/// pixels so the caller can handle the remainder.
/// </summary>
/// <param name="redChannel">The red plane; replaced with its unprocessed tail on return.</param>
/// <param name="greenChannel">The green plane; replaced with its unprocessed tail on return.</param>
/// <param name="blueChannel">The blue plane; replaced with its unprocessed tail on return.</param>
/// <param name="destination">
/// The target pixels; replaced with the unprocessed tail on return.
/// Each batch is written as four consecutive 32-byte vectors, i.e. exactly the bytes of the batch.
/// </param>
internal static void PackFromRgbPlanesAvx2Reduce(
    ref ReadOnlySpan<byte> redChannel,
    ref ReadOnlySpan<byte> greenChannel,
    ref ReadOnlySpan<byte> blueChannel,
    ref Span<Rgba32> destination)
{
    ref Vector256<byte> rBase = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(redChannel));
    ref Vector256<byte> gBase = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(greenChannel));
    ref Vector256<byte> bBase = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(blueChannel));
    ref Vector256<byte> dBase = ref Unsafe.As<Rgba32, Vector256<byte>>(ref MemoryMarshal.GetReference(destination));

    int count = redChannel.Length / Vector256<byte>.Count;

    // Reorders the 32-bit elements of each input vector ahead of the in-lane unpacks below.
    // The Rgb24-only controls (alpha shuffle / pack permute) are intentionally not loaded here:
    // Rgba32 keeps the alpha byte in place, so they were dead loads.
    ref byte control1Bytes = ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskEvenOdd8x32);
    Vector256<uint> control1 = Unsafe.As<byte, Vector256<uint>>(ref control1Bytes);

    // Opaque alpha, interleaved as the 4th byte of every pixel.
    Vector256<byte> a = Vector256.Create((byte)255);

    for (int i = 0; i < count; i++)
    {
        Vector256<byte> r0 = Unsafe.Add(ref rBase, i);
        Vector256<byte> g0 = Unsafe.Add(ref gBase, i);
        Vector256<byte> b0 = Unsafe.Add(ref bBase, i);

        r0 = Avx2.PermuteVar8x32(r0.AsUInt32(), control1).AsByte();
        g0 = Avx2.PermuteVar8x32(g0.AsUInt32(), control1).AsByte();
        b0 = Avx2.PermuteVar8x32(b0.AsUInt32(), control1).AsByte();

        // Interleave bytes into (r, g) and (b, a) pairs, then the pairs into (r, g, b, a) pixels.
        Vector256<byte> rg = Avx2.UnpackLow(r0, g0);
        Vector256<byte> b1 = Avx2.UnpackLow(b0, a);

        Vector256<byte> rgb1 = Avx2.UnpackLow(rg.AsUInt16(), b1.AsUInt16()).AsByte();
        Vector256<byte> rgb2 = Avx2.UnpackHigh(rg.AsUInt16(), b1.AsUInt16()).AsByte();

        rg = Avx2.UnpackHigh(r0, g0);
        b1 = Avx2.UnpackHigh(b0, a);

        Vector256<byte> rgb3 = Avx2.UnpackLow(rg.AsUInt16(), b1.AsUInt16()).AsByte();
        Vector256<byte> rgb4 = Avx2.UnpackHigh(rg.AsUInt16(), b1.AsUInt16()).AsByte();

        // Four output vectors per input vector.
        ref Vector256<byte> d0 = ref Unsafe.Add(ref dBase, i * 4);
        d0 = rgb1;
        Unsafe.Add(ref d0, 1) = rgb2;
        Unsafe.Add(ref d0, 2) = rgb3;
        Unsafe.Add(ref d0, 3) = rgb4;
    }

    // Hand the unprocessed tail back to the caller.
    int slice = count * Vector256<byte>.Count;
    redChannel = redChannel.Slice(slice);
    greenChannel = greenChannel.Slice(slice);
    blueChannel = blueChannel.Slice(slice);
    destination = destination.Slice(slice);
}
792939
}
793940
}
794941
}
Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
// Copyright (c) Six Labors.
2+
// Licensed under the Apache License, Version 2.0.
3+
4+
using System;
5+
using System.Runtime.CompilerServices;
6+
using System.Runtime.InteropServices;
7+
using SixLabors.ImageSharp.PixelFormats;
8+
9+
#if SUPPORTS_RUNTIME_INTRINSICS
10+
using System.Runtime.Intrinsics;
11+
using System.Runtime.Intrinsics.X86;
12+
#endif
13+
14+
namespace SixLabors.ImageSharp
15+
{
16+
internal static partial class SimdUtils
17+
{
18+
/// <summary>
/// Interleaves separate red, green and blue byte planes into packed <see cref="Rgb24"/> pixels.
/// Uses the AVX2 implementation when it is compiled in and supported, otherwise the scalar batched
/// implementation; whatever the bulk step leaves over is handled by the remainder routine.
/// </summary>
/// <param name="configuration">The configuration. It is not read by this method.</param>
/// <param name="redChannel">The red plane.</param>
/// <param name="greenChannel">The green plane; must have the same length as <paramref name="redChannel"/>.</param>
/// <param name="blueChannel">The blue plane; must have the same length as <paramref name="redChannel"/>.</param>
/// <param name="destination">
/// The target pixels. Must be at least 3 elements longer than the channels (checked in debug builds).
/// </param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void PackFromRgbPlanes(
    Configuration configuration,
    ReadOnlySpan<byte> redChannel,
    ReadOnlySpan<byte> greenChannel,
    ReadOnlySpan<byte> blueChannel,
    Span<Rgb24> destination)
{
    DebugGuard.IsTrue(redChannel.Length == greenChannel.Length, nameof(greenChannel), "Channels must be of same size!");
    DebugGuard.IsTrue(redChannel.Length == blueChannel.Length, nameof(blueChannel), "Channels must be of same size!");
    DebugGuard.IsTrue(destination.Length > redChannel.Length + 2, nameof(destination), "'destination' must contain a padding of 3 elements!");

#if SUPPORTS_RUNTIME_INTRINSICS
    if (Avx2.IsSupported)
    {
        // The bulk step advances all spans past the pixels it handled.
        HwIntrinsics.PackFromRgbPlanesAvx2Reduce(ref redChannel, ref greenChannel, ref blueChannel, ref destination);
        PackFromRgbPlanesRemainder(redChannel, greenChannel, blueChannel, destination);
        return;
    }
#endif

    // Fallback: four pixels at a time, then the leftovers.
    PackFromRgbPlanesScalarBatchedReduce(ref redChannel, ref greenChannel, ref blueChannel, ref destination);
    PackFromRgbPlanesRemainder(redChannel, greenChannel, blueChannel, destination);
}
43+
44+
/// <summary>
/// Interleaves separate red, green and blue byte planes into <see cref="Rgba32"/> pixels.
/// Uses the AVX2 implementation when it is compiled in and supported, otherwise the scalar batched
/// implementation; whatever the bulk step leaves over is handled by the remainder routine.
/// </summary>
/// <param name="configuration">The configuration. It is not read by this method.</param>
/// <param name="redChannel">The red plane.</param>
/// <param name="greenChannel">The green plane; must have the same length as <paramref name="redChannel"/>.</param>
/// <param name="blueChannel">The blue plane; must have the same length as <paramref name="redChannel"/>.</param>
/// <param name="destination">
/// The target pixels. Must not be shorter than the channels (checked in debug builds).
/// </param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void PackFromRgbPlanes(
    Configuration configuration,
    ReadOnlySpan<byte> redChannel,
    ReadOnlySpan<byte> greenChannel,
    ReadOnlySpan<byte> blueChannel,
    Span<Rgba32> destination)
{
    DebugGuard.IsTrue(greenChannel.Length == redChannel.Length, nameof(greenChannel), "Channels must be of same size!");
    DebugGuard.IsTrue(blueChannel.Length == redChannel.Length, nameof(blueChannel), "Channels must be of same size!");

    // '>=' rather than '>': a destination of exactly the source length is valid, as the message
    // states. The Rgba32 bulk paths write whole pixels only and need no trailing padding.
    DebugGuard.IsTrue(destination.Length >= redChannel.Length, nameof(destination), "'destination' span should not be shorter than the source channels!");

#if SUPPORTS_RUNTIME_INTRINSICS
    if (Avx2.IsSupported)
    {
        // The bulk step advances all spans past the pixels it handled.
        HwIntrinsics.PackFromRgbPlanesAvx2Reduce(ref redChannel, ref greenChannel, ref blueChannel, ref destination);
    }
    else
#endif
    {
        PackFromRgbPlanesScalarBatchedReduce(ref redChannel, ref greenChannel, ref blueChannel, ref destination);
    }

    PackFromRgbPlanesRemainder(redChannel, greenChannel, blueChannel, destination);
}
69+
70+
/// <summary>
/// Scalar fallback that interleaves the planes into <see cref="Rgb24"/> pixels four at a time.
/// Processes <c>redChannel.Length / 4</c> batches, then advances all four spans past the processed
/// pixels so the caller can handle the remainder.
/// </summary>
/// <param name="redChannel">The red plane; replaced with its unprocessed tail on return.</param>
/// <param name="greenChannel">The green plane; replaced with its unprocessed tail on return.</param>
/// <param name="blueChannel">The blue plane; replaced with its unprocessed tail on return.</param>
/// <param name="destination">The target pixels; replaced with the unprocessed tail on return.</param>
private static void PackFromRgbPlanesScalarBatchedReduce(
    ref ReadOnlySpan<byte> redChannel,
    ref ReadOnlySpan<byte> greenChannel,
    ref ReadOnlySpan<byte> blueChannel,
    ref Span<Rgb24> destination)
{
    const int BatchSize = 4;

    // View each plane as a sequence of 4-byte groups.
    ref ByteTuple4 redQuads = ref Unsafe.As<byte, ByteTuple4>(ref MemoryMarshal.GetReference(redChannel));
    ref ByteTuple4 greenQuads = ref Unsafe.As<byte, ByteTuple4>(ref MemoryMarshal.GetReference(greenChannel));
    ref ByteTuple4 blueQuads = ref Unsafe.As<byte, ByteTuple4>(ref MemoryMarshal.GetReference(blueChannel));
    ref Rgb24 pixels = ref MemoryMarshal.GetReference(destination);

    int batches = redChannel.Length / BatchSize;
    int pixelIndex = 0;

    for (int batch = 0; batch < batches; batch++, pixelIndex += BatchSize)
    {
        ref ByteTuple4 red = ref Unsafe.Add(ref redQuads, batch);
        ref ByteTuple4 green = ref Unsafe.Add(ref greenQuads, batch);
        ref ByteTuple4 blue = ref Unsafe.Add(ref blueQuads, batch);

        ref Rgb24 first = ref Unsafe.Add(ref pixels, pixelIndex);
        first.R = red.V0;
        first.G = green.V0;
        first.B = blue.V0;

        ref Rgb24 second = ref Unsafe.Add(ref first, 1);
        second.R = red.V1;
        second.G = green.V1;
        second.B = blue.V1;

        ref Rgb24 third = ref Unsafe.Add(ref first, 2);
        third.R = red.V2;
        third.G = green.V2;
        third.B = blue.V2;

        ref Rgb24 fourth = ref Unsafe.Add(ref first, 3);
        fourth.R = red.V3;
        fourth.G = green.V3;
        fourth.B = blue.V3;
    }

    // pixelIndex is now batches * BatchSize, i.e. the number of pixels written.
    redChannel = redChannel.Slice(pixelIndex);
    greenChannel = greenChannel.Slice(pixelIndex);
    blueChannel = blueChannel.Slice(pixelIndex);
    destination = destination.Slice(pixelIndex);
}
116+
117+
/// <summary>
/// Scalar fallback that interleaves the planes into <see cref="Rgba32"/> pixels four at a time.
/// The whole destination is first filled with opaque black so every alpha byte is 255, then
/// <c>redChannel.Length / 4</c> batches of R, G and B values are written. Finally all four spans
/// are advanced past the processed pixels so the caller can handle the remainder.
/// </summary>
/// <param name="redChannel">The red plane; replaced with its unprocessed tail on return.</param>
/// <param name="greenChannel">The green plane; replaced with its unprocessed tail on return.</param>
/// <param name="blueChannel">The blue plane; replaced with its unprocessed tail on return.</param>
/// <param name="destination">The target pixels; replaced with the unprocessed tail on return.</param>
private static void PackFromRgbPlanesScalarBatchedReduce(
    ref ReadOnlySpan<byte> redChannel,
    ref ReadOnlySpan<byte> greenChannel,
    ref ReadOnlySpan<byte> blueChannel,
    ref Span<Rgba32> destination)
{
    const int BatchSize = 4;

    // View each plane as a sequence of 4-byte groups.
    ref ByteTuple4 redQuads = ref Unsafe.As<byte, ByteTuple4>(ref MemoryMarshal.GetReference(redChannel));
    ref ByteTuple4 greenQuads = ref Unsafe.As<byte, ByteTuple4>(ref MemoryMarshal.GetReference(greenChannel));
    ref ByteTuple4 blueQuads = ref Unsafe.As<byte, ByteTuple4>(ref MemoryMarshal.GetReference(blueChannel));
    ref Rgba32 pixels = ref MemoryMarshal.GetReference(destination);

    int batches = redChannel.Length / BatchSize;

    // Pre-set alpha (and zero the colour bytes) for the entire destination;
    // the loop below only touches R, G and B.
    destination.Fill(new Rgba32(0, 0, 0, 255));

    int pixelIndex = 0;
    for (int batch = 0; batch < batches; batch++, pixelIndex += BatchSize)
    {
        ref ByteTuple4 red = ref Unsafe.Add(ref redQuads, batch);
        ref ByteTuple4 green = ref Unsafe.Add(ref greenQuads, batch);
        ref ByteTuple4 blue = ref Unsafe.Add(ref blueQuads, batch);

        ref Rgba32 first = ref Unsafe.Add(ref pixels, pixelIndex);
        first.R = red.V0;
        first.G = green.V0;
        first.B = blue.V0;

        ref Rgba32 second = ref Unsafe.Add(ref first, 1);
        second.R = red.V1;
        second.G = green.V1;
        second.B = blue.V1;

        ref Rgba32 third = ref Unsafe.Add(ref first, 2);
        third.R = red.V2;
        third.G = green.V2;
        third.B = blue.V2;

        ref Rgba32 fourth = ref Unsafe.Add(ref first, 3);
        fourth.R = red.V3;
        fourth.G = green.V3;
        fourth.B = blue.V3;
    }

    // pixelIndex is now batches * BatchSize, i.e. the number of pixels written.
    redChannel = redChannel.Slice(pixelIndex);
    greenChannel = greenChannel.Slice(pixelIndex);
    blueChannel = blueChannel.Slice(pixelIndex);
    destination = destination.Slice(pixelIndex);
}
164+
165+
/// <summary>
/// Interleaves the planes into <see cref="Rgb24"/> pixels one element at a time.
/// Used for the pixels left over after a batched pass.
/// </summary>
/// <param name="redChannel">The red plane.</param>
/// <param name="greenChannel">The green plane; expected to be as long as <paramref name="redChannel"/>.</param>
/// <param name="blueChannel">The blue plane; expected to be as long as <paramref name="redChannel"/>.</param>
/// <param name="destination">
/// The target pixels. May be longer than the channels; elements beyond the source length are left untouched.
/// </param>
private static void PackFromRgbPlanesRemainder(
    ReadOnlySpan<byte> redChannel,
    ReadOnlySpan<byte> greenChannel,
    ReadOnlySpan<byte> blueChannel,
    Span<Rgb24> destination)
{
    ref byte r = ref MemoryMarshal.GetReference(redChannel);
    ref byte g = ref MemoryMarshal.GetReference(greenChannel);
    ref byte b = ref MemoryMarshal.GetReference(blueChannel);
    ref Rgb24 rgb = ref MemoryMarshal.GetReference(destination);

    // The destination is allowed to be longer than the source planes (trailing padding), so
    // bounding the loop by destination.Length would read past the end of the planes through
    // unchecked refs. Bound by the shorter of the two so neither side is overrun.
    int count = Math.Min(redChannel.Length, destination.Length);

    for (int i = 0; i < count; i++)
    {
        ref Rgb24 d = ref Unsafe.Add(ref rgb, i);
        d.R = Unsafe.Add(ref r, i);
        d.G = Unsafe.Add(ref g, i);
        d.B = Unsafe.Add(ref b, i);
    }
}
184+
185+
/// <summary>
/// Interleaves the planes into <see cref="Rgba32"/> pixels one element at a time, setting alpha to 255.
/// Used for the pixels left over after a batched pass.
/// </summary>
/// <param name="redChannel">The red plane.</param>
/// <param name="greenChannel">The green plane; expected to be as long as <paramref name="redChannel"/>.</param>
/// <param name="blueChannel">The blue plane; expected to be as long as <paramref name="redChannel"/>.</param>
/// <param name="destination">
/// The target pixels. May be longer than the channels; elements beyond the source length are left untouched.
/// </param>
private static void PackFromRgbPlanesRemainder(
    ReadOnlySpan<byte> redChannel,
    ReadOnlySpan<byte> greenChannel,
    ReadOnlySpan<byte> blueChannel,
    Span<Rgba32> destination)
{
    ref byte r = ref MemoryMarshal.GetReference(redChannel);
    ref byte g = ref MemoryMarshal.GetReference(greenChannel);
    ref byte b = ref MemoryMarshal.GetReference(blueChannel);
    ref Rgba32 rgba = ref MemoryMarshal.GetReference(destination);

    // The destination may be longer than the source planes, so bounding the loop by
    // destination.Length would read past the end of the planes through unchecked refs.
    // Bound by the shorter of the two so neither side is overrun.
    int count = Math.Min(redChannel.Length, destination.Length);

    for (int i = 0; i < count; i++)
    {
        ref Rgba32 d = ref Unsafe.Add(ref rgba, i);
        d.R = Unsafe.Add(ref r, i);
        d.G = Unsafe.Add(ref g, i);
        d.B = Unsafe.Add(ref b, i);
        d.A = 255;
    }
}
205+
}
206+
}

src/ImageSharp/Common/Helpers/SimdUtils.cs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
using System.Numerics;
77
using System.Runtime.CompilerServices;
88
using System.Runtime.InteropServices;
9+
using SixLabors.ImageSharp.PixelFormats;
910
#if SUPPORTS_RUNTIME_INTRINSICS
1011
using System.Runtime.Intrinsics;
1112
using System.Runtime.Intrinsics.X86;
@@ -220,5 +221,13 @@ private static void VerifySpanInput(ReadOnlySpan<float> source, Span<byte> dest,
220221
nameof(source),
221222
$"length should be divisible by {shouldBeDivisibleBy}!");
222223
}
224+
225+
/// <summary>
/// A group of four bytes declared as four consecutive fields.
/// The layout is stated explicitly as sequential, which is also the default for structs.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
private struct ByteTuple4
{
    /// <summary>The first byte of the group.</summary>
    public byte V0;

    /// <summary>The second byte of the group.</summary>
    public byte V1;

    /// <summary>The third byte of the group.</summary>
    public byte V2;

    /// <summary>The fourth byte of the group.</summary>
    public byte V3;
}
223232
}
224233
}

0 commit comments

Comments
 (0)