|
4 | 4 | using System; |
5 | 5 | using System.Numerics; |
6 | 6 | using System.Runtime.CompilerServices; |
| 7 | +#if SUPPORTS_RUNTIME_INTRINSICS |
| 8 | +using System.Runtime.InteropServices; |
| 9 | +using System.Runtime.Intrinsics; |
| 10 | +using System.Runtime.Intrinsics.X86; |
| 11 | +#endif |
7 | 12 |
|
8 | 13 | namespace SixLabors.ImageSharp.Processing.Processors.Transforms |
9 | 14 | { |
@@ -66,21 +71,94 @@ public Vector4 Convolve(Span<Vector4> rowSpan) |
[MethodImpl(InliningOptions.ShortMethod)]
public Vector4 ConvolveCore(ref Vector4 rowStartRef)
{
    // Computes the weighted sum of this.Length consecutive Vector4 values starting at rowStartRef,
    // where the i-th value is multiplied by the i-th float read from this.bufferPtr.
#if SUPPORTS_RUNTIME_INTRINSICS
    // The vectorized path below relies on Avx2.PermuteVar8x32 in addition to the FMA instructions.
    // FMA support does not imply AVX2 support, so both instruction sets have to be checked here;
    // otherwise the AVX2 call would throw PlatformNotSupportedException on hardware that only has FMA.
    if (Avx2.IsSupported && Fma.IsSupported)
    {
        float* bufferStart = this.bufferPtr;

        // End of the part of the weights buffer that can be consumed four weights at a time.
        float* bufferEnd = bufferStart + (this.Length & ~3);

        // Two independent accumulators, one per unrolled FMA in the main loop.
        Vector256<float> result256_0 = Vector256<float>.Zero;
        Vector256<float> result256_1 = Vector256<float>.Zero;

        // Permutation indices for Avx2.PermuteVar8x32, stored as eight little-endian Int32 values:
        // lanes 0-3 select source element 0 and lanes 4-7 select source element 1, which turns
        // a source of [a, b, ...] into [a, a, a, a, b, b, b, b].
        ReadOnlySpan<byte> maskBytes = new byte[]
        {
            0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0,
            1, 0, 0, 0, 1, 0, 0, 0,
            1, 0, 0, 0, 1, 0, 0, 0,
        };
        Vector256<int> mask = Unsafe.ReadUnaligned<Vector256<int>>(ref MemoryMarshal.GetReference(maskBytes));

        while (bufferStart < bufferEnd)
        {
            // It is important to use a single expression here so that the JIT will correctly use vfmadd231ps
            // for the FMA operation, and execute it directly on the target register and reading directly from
            // memory for the first parameter. This skips initializing a SIMD register, and an extra copy.
            // The code below should compile to the following assembly on .NET 5 x64:
            //
            // vmovsd xmm2, [rax]            ; load *(double*)bufferStart into xmm2 as [ab, _]
            // vpermps ymm2, ymm1, ymm2      ; permute as a float YMM register to [a, a, a, a, b, b, b, b]
            // vfmadd231ps ymm0, ymm2, [r8]  ; result256_0 = FMA(pixels, factors) + result256_0
            //
            // For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212.
            // Additionally, we're also unrolling two computations per loop iteration to leverage the
            // fact that most CPUs have two ports to schedule multiply operations for FMA instructions.
            result256_0 = Fma.MultiplyAdd(
                Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
                Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
                result256_0);

            result256_1 = Fma.MultiplyAdd(
                Unsafe.As<Vector4, Vector256<float>>(ref Unsafe.Add(ref rowStartRef, 2)),
                Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)(bufferStart + 2)).AsSingle(), mask),
                result256_1);

            bufferStart += 4;
            rowStartRef = ref Unsafe.Add(ref rowStartRef, 4);
        }

        // Merge the two accumulators of the unrolled loop.
        result256_0 = Avx.Add(result256_0, result256_1);

        // Two or three items left: process one more pair with a single 256-bit FMA.
        if ((this.Length & 3) >= 2)
        {
            result256_0 = Fma.MultiplyAdd(
                Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
                Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
                result256_0);

            bufferStart += 2;
            rowStartRef = ref Unsafe.Add(ref rowStartRef, 2);
        }

        // Each 128-bit half holds the partial sum of every other pixel; add them into one Vector4-sized sum.
        Vector128<float> result128 = Sse.Add(result256_0.GetLower(), result256_0.GetUpper());

        // Odd length: one last item remains, processed with a 128-bit FMA and a broadcast weight.
        if ((this.Length & 1) != 0)
        {
            result128 = Fma.MultiplyAdd(
                Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef),
                Vector128.Create(*bufferStart),
                result128);
        }

        // Reinterpret the four accumulated floats as a Vector4.
        return *(Vector4*)&result128;
    }
    else
#endif
    {
        // Scalar fallback: accumulate one weighted Vector4 per iteration.
        // Destination color components
        Vector4 result = Vector4.Zero;
        float* bufferStart = this.bufferPtr;
        float* bufferEnd = this.bufferPtr + this.Length;

        while (bufferStart < bufferEnd)
        {
            // Multiply the current item by its weight and add it to the running sum.
            result += rowStartRef * *bufferStart;

            bufferStart++;
            rowStartRef = ref Unsafe.Add(ref rowStartRef, 1);
        }

        return result;
    }
}
85 | 163 |
|
86 | 164 | /// <summary> |
|
0 commit comments