Skip to content

Commit 120080b

Browse files
Merge pull request #1402 from SixLabors/js/vector4octet-pack
Add AVX2 Vector4Octet.Pack implementation
2 parents b5975a3 + 3ae4b02 commit 120080b

File tree

4 files changed

+103
-5
lines changed

4 files changed

+103
-5
lines changed

src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,32 @@ internal static partial class SimdUtils
1414
{
1515
public static class HwIntrinsics
1616
{
17-
private static ReadOnlySpan<byte> PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 };
17+
/// <summary>Gets 32 bytes that, read as eight little-endian 32-bit integers, are 0, 4, 1, 5, 2, 6, 3, 7; presumably the index vector for an 8x32 permute (name-based, usage not shown here - confirm).</summary>
public static ReadOnlySpan<byte> PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 };
18+
19+
/// <summary>Gets 32 bytes that, read as eight little-endian 32-bit integers, are 0, 2, 4, 6, 1, 3, 5, 7; the JPEG YCbCr converter reinterprets them as a <see cref="Vector256{T}"/> of int and passes that as the control operand of <c>Avx2.PermuteVar8x32</c>.</summary>
public static ReadOnlySpan<byte> PermuteMaskEvenOdd8x32 => new byte[] { 0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 };
20+
21+
/// <summary>
22+
/// Computes <c>va + (vm0 * vm1)</c> element-wise on <see cref="Vector256{T}"/> values, using <c>Fma.MultiplyAdd</c> when FMA is supported and a separate AVX multiply followed by an add otherwise.
23+
/// </summary>
24+
/// <param name="va">The vector to add to the intermediate result.</param>
25+
/// <param name="vm0">The first vector to multiply.</param>
26+
/// <param name="vm1">The second vector to multiply.</param>
27+
/// <returns>The <see cref="Vector256{T}"/> holding <c>va + (vm0 * vm1)</c> for each element.</returns>
28+
[MethodImpl(InliningOptions.ShortMethod)]
29+
public static Vector256<float> MultiplyAdd(
30+
in Vector256<float> va,
31+
in Vector256<float> vm0,
32+
in Vector256<float> vm1)
33+
{
34+
if (Fma.IsSupported)
35+
{
36+
return Fma.MultiplyAdd(vm1, vm0, va);
37+
}
38+
else
39+
{
40+
return Avx.Add(Avx.Multiply(vm0, vm1), va);
41+
}
42+
}
1843

1944
/// <summary>
2045
/// <see cref="ByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).

src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs

Lines changed: 74 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
1-
// Copyright (c) Six Labors.
1+
// Copyright (c) Six Labors.
22
// Licensed under the Apache License, Version 2.0.
33

44
using System;
55
using System.Numerics;
66
using System.Runtime.CompilerServices;
77
using System.Runtime.InteropServices;
8-
8+
#if SUPPORTS_RUNTIME_INTRINSICS
9+
using System.Runtime.Intrinsics;
10+
using System.Runtime.Intrinsics.X86;
11+
using static SixLabors.ImageSharp.SimdUtils;
12+
#endif
913
using SixLabors.ImageSharp.Tuples;
1014

1115
// ReSharper disable ImpureMethodCallOnReadonlyValueField
@@ -47,6 +51,73 @@ internal static void ConvertCore(in ComponentValues values, Span<Vector4> result
4751
"JpegColorConverter.FromYCbCrSimd256 can be used only on architecture having 256 byte floating point SIMD registers!");
4852
}
4953

54+
#if SUPPORTS_RUNTIME_INTRINSICS
55+
ref Vector256<float> yBase =
56+
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));
57+
ref Vector256<float> cbBase =
58+
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component1));
59+
ref Vector256<float> crBase =
60+
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component2));
61+
62+
ref Vector256<float> resultBase =
63+
ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(result));
64+
65+
// Used for the color conversion
66+
var chromaOffset = Vector256.Create(-halfValue);
67+
var scale = Vector256.Create(1 / maxValue);
68+
var rCrMult = Vector256.Create(1.402F);
69+
var gCbMult = Vector256.Create(-0.344136F);
70+
var gCrMult = Vector256.Create(-0.714136F);
71+
var bCbMult = Vector256.Create(1.772F);
72+
73+
// Used for packing.
74+
var va = Vector256.Create(1F);
75+
ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskEvenOdd8x32);
76+
Vector256<int> vcontrol = Unsafe.As<byte, Vector256<int>>(ref control);
77+
78+
// Walking 8 elements at one step:
79+
int n = result.Length / 8;
80+
for (int i = 0; i < n; i++)
81+
{
82+
// y = yVals[i];
83+
// cb = cbVals[i] - 128F;
84+
// cr = crVals[i] - 128F;
85+
Vector256<float> y = Unsafe.Add(ref yBase, i);
86+
Vector256<float> cb = Avx.Add(Unsafe.Add(ref cbBase, i), chromaOffset);
87+
Vector256<float> cr = Avx.Add(Unsafe.Add(ref crBase, i), chromaOffset);
88+
89+
y = Avx2.PermuteVar8x32(y, vcontrol);
90+
cb = Avx2.PermuteVar8x32(cb, vcontrol);
91+
cr = Avx2.PermuteVar8x32(cr, vcontrol);
92+
93+
// r = y + (1.402F * cr);
94+
// g = y - (0.344136F * cb) - (0.714136F * cr);
95+
// b = y + (1.772F * cb);
96+
// Adding & multiplying 8 elements at one time:
97+
Vector256<float> r = HwIntrinsics.MultiplyAdd(y, cr, rCrMult);
98+
Vector256<float> g = HwIntrinsics.MultiplyAdd(HwIntrinsics.MultiplyAdd(y, cb, gCbMult), cr, gCrMult);
99+
Vector256<float> b = HwIntrinsics.MultiplyAdd(y, cb, bCbMult);
100+
101+
// TODO: We should be saving to RGBA not Vector4
102+
r = Avx.Multiply(Avx.RoundToNearestInteger(r), scale);
103+
g = Avx.Multiply(Avx.RoundToNearestInteger(g), scale);
104+
b = Avx.Multiply(Avx.RoundToNearestInteger(b), scale);
105+
106+
Vector256<float> vte = Avx.UnpackLow(r, b);
107+
Vector256<float> vto = Avx.UnpackLow(g, va);
108+
109+
ref Vector256<float> destination = ref Unsafe.Add(ref resultBase, i * 4);
110+
111+
destination = Avx.UnpackLow(vte, vto);
112+
Unsafe.Add(ref destination, 1) = Avx.UnpackHigh(vte, vto);
113+
114+
vte = Avx.UnpackHigh(r, b);
115+
vto = Avx.UnpackHigh(g, va);
116+
117+
Unsafe.Add(ref destination, 2) = Avx.UnpackLow(vte, vto);
118+
Unsafe.Add(ref destination, 3) = Avx.UnpackHigh(vte, vto);
119+
}
120+
#else
50121
ref Vector<float> yBase =
51122
ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component0));
52123
ref Vector<float> cbBase =
@@ -104,6 +175,7 @@ internal static void ConvertCore(in ComponentValues values, Span<Vector4> result
104175
ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i);
105176
destination.Pack(ref rr, ref gg, ref bb);
106177
}
178+
#endif
107179
}
108180
}
109181
}

src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
using System;
55
using System.Collections.Generic;
66
using System.Numerics;
7-
87
using SixLabors.ImageSharp.Memory;
98
using SixLabors.ImageSharp.Tuples;
109

tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,9 @@ public HwIntrinsics_SSE_AVX()
7373
}
7474
#endif
7575
this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31)
76-
.WithEnvironmentVariables(new EnvironmentVariable(EnableHWIntrinsic, Off))
76+
.WithEnvironmentVariables(
77+
new EnvironmentVariable(EnableHWIntrinsic, Off),
78+
new EnvironmentVariable(FeatureSIMD, Off))
7779
.WithId("No HwIntrinsics"));
7880
}
7981
}

0 commit comments

Comments
 (0)