Skip to content

Commit 74dd8cd

Browse files
Use ROS trick all round and optimize Shuffle3
1 parent a46fb9b commit 74dd8cd

File tree

1 file changed

+20
-16
lines changed

1 file changed

+20
-16
lines changed

src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ public static class HwIntrinsics
1818

1919
/// <summary>
/// Gets a 32-byte constant table. Read as eight consecutive little-endian 32-bit values
/// the bytes spell out the index sequence 0, 2, 4, 6, 1, 3, 5, 7 (even indices first, then odd).
/// </summary>
/// <remarks>
/// NOTE(review): presumably the control operand for an 8 x 32-bit lane permute that groups
/// even-indexed elements ahead of odd-indexed ones; confirm against the call sites.
/// The initializer is deliberately a plain <c>new byte[]</c> of constants returned as
/// <see cref="ReadOnlySpan{T}"/>; keep that exact shape when editing.
/// </remarks>
public static ReadOnlySpan<byte> PermuteMaskEvenOdd8x32 => new byte[] { 0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 };
2020

21+
/// <summary>
/// Gets the 16-byte shuffle control used to widen packed 3-byte groups to 4-byte groups:
/// source bytes 0..11 are taken three at a time and every fourth control byte is 0x80.
/// </summary>
/// <remarks>
/// Callers reinterpret the first byte of this span as a <c>Vector128&lt;byte&gt;</c> and pass it as
/// the mask to <c>Ssse3.Shuffle</c>, so the table must stay exactly 16 bytes long.
/// A control byte with the high bit set (0x80) makes the byte shuffle write zero to that lane,
/// which leaves the fourth byte of each output group cleared.
/// </remarks>
private static ReadOnlySpan<byte> ShuffleMaskPad4Nx16 => new byte[] { 0, 1, 2, 0x80, 3, 4, 5, 0x80, 6, 7, 8, 0x80, 9, 10, 11, 0x80 };
22+
23+
/// <summary>
/// Gets the 16-byte shuffle control used to narrow 4-byte groups to packed 3-byte groups:
/// it selects source bytes 0-2, 4-6, 8-10 and 12-14 into the low 12 lanes and uses 0x80
/// for the upper 4 lanes.
/// </summary>
/// <remarks>
/// Callers reinterpret the first byte of this span as a <c>Vector128&lt;byte&gt;</c>, so the table
/// must stay exactly 16 bytes long. They also derive a second mask from it by rotating it
/// 12 bytes with <c>Ssse3.AlignRight(mask, mask, 12)</c>, which moves the four 0x80 entries to the
/// low lanes. A control byte with the high bit set (0x80) makes the byte shuffle write zero
/// to that lane.
/// </remarks>
private static ReadOnlySpan<byte> ShuffleMaskSlice4Nx16 => new byte[] { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 0x80, 0x80, 0x80, 0x80 };
24+
2125
/// <summary>
2226
/// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
2327
/// using the control and store the results in <paramref name="dest"/>.
@@ -352,10 +356,12 @@ private static void Shuffle3(
352356
{
353357
if (Ssse3.IsSupported)
354358
{
355-
Vector128<byte> vmask = Vector128.Create(0, 1, 2, 0x80, 3, 4, 5, 0x80, 6, 7, 8, 0x80, 9, 10, 11, 0x80).AsByte();
359+
ref byte vmaskBase = ref MemoryMarshal.GetReference(ShuffleMaskPad4Nx16);
360+
Vector128<byte> vmask = Unsafe.As<byte, Vector128<byte>>(ref vmaskBase);
356361
Vector128<byte> vfill = Vector128.Create(0xff000000ff000000ul).AsByte();
357-
Vector128<byte> vmasko = Vector128.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15).AsByte();
358-
Vector128<byte> vmaske = Ssse3.AlignRight(vmasko, vmasko, 12).AsByte();
362+
ref byte vmaskoBase = ref MemoryMarshal.GetReference(ShuffleMaskSlice4Nx16);
363+
Vector128<byte> vmasko = Unsafe.As<byte, Vector128<byte>>(ref vmaskoBase);
364+
Vector128<byte> vmaske = Ssse3.AlignRight(vmasko, vmasko, 12);
359365

360366
Span<byte> bytes = stackalloc byte[Vector128<byte>.Count];
361367
Shuffle.MmShuffleSpan(ref bytes, control);
@@ -381,23 +387,19 @@ private static void Shuffle3(
381387
v2 = Ssse3.AlignRight(v2, v1, 8);
382388
v1 = Ssse3.AlignRight(v1, v0, 12);
383389

384-
v0 = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v0, vmask), vfill), vshuffle);
385-
v1 = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v1, vmask), vfill), vshuffle);
386-
v2 = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v2, vmask), vfill), vshuffle);
387-
v3 = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v3, vmask), vfill), vshuffle);
390+
v0 = Ssse3.Shuffle(Ssse3.Shuffle(v0, vmask), vshuffle);
391+
v1 = Ssse3.Shuffle(Ssse3.Shuffle(v1, vmask), vshuffle);
392+
v2 = Ssse3.Shuffle(Ssse3.Shuffle(v2, vmask), vshuffle);
393+
v3 = Ssse3.Shuffle(Ssse3.Shuffle(v3, vmask), vshuffle);
388394

389395
v0 = Ssse3.Shuffle(v0, vmaske);
390396
v1 = Ssse3.Shuffle(v1, vmasko);
391397
v2 = Ssse3.Shuffle(v2, vmaske);
392398
v3 = Ssse3.Shuffle(v3, vmasko);
393399

394400
v0 = Ssse3.AlignRight(v1, v0, 4);
395-
v3 = Ssse3.AlignRight(v3, v2, 12);
396-
397-
v1 = Sse2.ShiftLeftLogical128BitLane(v1, 4);
398-
v2 = Sse2.ShiftRightLogical128BitLane(v2, 4);
399-
400-
v1 = Ssse3.AlignRight(v2, v1, 8);
401+
v1 = Sse2.Or(Sse2.ShiftRightLogical128BitLane(v1, 4), Sse2.ShiftLeftLogical128BitLane(v2, 4));
402+
v2 = Ssse3.AlignRight(v3, v2, 12);
401403

402404
ref Vector128<byte> vd = ref Unsafe.Add(ref destBase, i);
403405

@@ -416,7 +418,8 @@ private static void Pad3Shuffle4(
416418
{
417419
if (Ssse3.IsSupported)
418420
{
419-
Vector128<byte> vmask = Vector128.Create(0, 1, 2, 0x80, 3, 4, 5, 0x80, 6, 7, 8, 0x80, 9, 10, 11, 0x80).AsByte();
421+
ref byte vmaskBase = ref MemoryMarshal.GetReference(ShuffleMaskPad4Nx16);
422+
Vector128<byte> vmask = Unsafe.As<byte, Vector128<byte>>(ref vmaskBase);
420423
Vector128<byte> vfill = Vector128.Create(0xff000000ff000000ul).AsByte();
421424

422425
Span<byte> bytes = stackalloc byte[Vector128<byte>.Count];
@@ -459,8 +462,9 @@ private static void Shuffle4Slice3(
459462
{
460463
if (Ssse3.IsSupported)
461464
{
462-
Vector128<byte> vmasko = Vector128.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15).AsByte();
463-
Vector128<byte> vmaske = Ssse3.AlignRight(vmasko, vmasko, 12).AsByte();
465+
ref byte vmaskoBase = ref MemoryMarshal.GetReference(ShuffleMaskSlice4Nx16);
466+
Vector128<byte> vmasko = Unsafe.As<byte, Vector128<byte>>(ref vmaskoBase);
467+
Vector128<byte> vmaske = Ssse3.AlignRight(vmasko, vmasko, 12);
464468

465469
Span<byte> bytes = stackalloc byte[Vector128<byte>.Count];
466470
Shuffle.MmShuffleSpan(ref bytes, control);

0 commit comments

Comments
 (0)