Skip to content

Commit 7eb5cc0

Browse files
Merge pull request #1513 from SixLabors/sp/simd-resize-convolve
Speed improvements to resize kernel (w/ SIMD)
2 parents eab04e4 + e2211c3 commit 7eb5cc0

File tree

2 files changed

+89
-11
lines changed

2 files changed

+89
-11
lines changed

src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs

Lines changed: 88 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@
44
using System;
55
using System.Numerics;
66
using System.Runtime.CompilerServices;
7+
#if SUPPORTS_RUNTIME_INTRINSICS
8+
using System.Runtime.InteropServices;
9+
using System.Runtime.Intrinsics;
10+
using System.Runtime.Intrinsics.X86;
11+
#endif
712

813
namespace SixLabors.ImageSharp.Processing.Processors.Transforms
914
{
@@ -66,21 +71,94 @@ public Vector4 Convolve(Span<Vector4> rowSpan)
6671
[MethodImpl(InliningOptions.ShortMethod)]
public Vector4 ConvolveCore(ref Vector4 rowStartRef)
{
    // Computes the weighted sum of this.Length consecutive Vector4 values starting at rowStartRef,
    // using the this.Length float weights stored at this.bufferPtr. Returns the accumulated vector.
    // The caller must guarantee that at least this.Length elements are readable from rowStartRef.
#if SUPPORTS_RUNTIME_INTRINSICS
    // The vectorized path needs both instruction sets: Fma for MultiplyAdd and Avx2 for PermuteVar8x32.
    // FMA support does not imply AVX2 support, so both flags must be checked explicitly; otherwise
    // the permute below would throw PlatformNotSupportedException on FMA-only hardware.
    if (Avx2.IsSupported && Fma.IsSupported)
    {
        float* bufferStart = this.bufferPtr;

        // The main loop consumes 4 weights (and 4 pixels) per iteration.
        float* bufferEnd = bufferStart + (this.Length & ~3);
        Vector256<float> result256_0 = Vector256<float>.Zero;
        Vector256<float> result256_1 = Vector256<float>.Zero;

        // Permutation indices [0, 0, 0, 0, 1, 1, 1, 1] stored as little-endian 32-bit integers:
        // broadcasts the first weight to the lower 128 bits and the second weight to the upper 128 bits.
        ReadOnlySpan<byte> maskBytes = new byte[]
        {
            0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0,
            1, 0, 0, 0, 1, 0, 0, 0,
            1, 0, 0, 0, 1, 0, 0, 0,
        };
        Vector256<int> mask = Unsafe.ReadUnaligned<Vector256<int>>(ref MemoryMarshal.GetReference(maskBytes));

        while (bufferStart < bufferEnd)
        {
            // It is important to use a single expression here so that the JIT will correctly use vfmadd231ps
            // for the FMA operation, and execute it directly on the target register and reading directly from
            // memory for the first parameter. This skips initializing a SIMD register, and an extra copy.
            // The code below should compile to the following assembly on .NET 5 x64:
            //
            // vmovsd xmm2, [rax]            ; load *(double*)bufferStart into xmm2 as [ab, _]
            // vpermps ymm2, ymm1, ymm2      ; permute as a float YMM register to [a, a, a, a, b, b, b, b]
            // vfmadd231ps ymm0, ymm2, [r8]  ; result256_0 = FMA(pixels, factors) + result256_0
            //
            // For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212.
            // Additionally, we're also unrolling two computations per loop iteration to leverage the
            // fact that most CPUs have two ports to schedule multiply operations for FMA instructions.
            result256_0 = Fma.MultiplyAdd(
                Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
                Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
                result256_0);

            result256_1 = Fma.MultiplyAdd(
                Unsafe.As<Vector4, Vector256<float>>(ref Unsafe.Add(ref rowStartRef, 2)),
                Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)(bufferStart + 2)).AsSingle(), mask),
                result256_1);

            bufferStart += 4;
            rowStartRef = ref Unsafe.Add(ref rowStartRef, 4);
        }

        // Merge the two independent accumulators.
        result256_0 = Avx.Add(result256_0, result256_1);

        // Handle a remaining pair of pixels (Length % 4 is 2 or 3).
        if ((this.Length & 3) >= 2)
        {
            result256_0 = Fma.MultiplyAdd(
                Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
                Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
                result256_0);

            bufferStart += 2;
            rowStartRef = ref Unsafe.Add(ref rowStartRef, 2);
        }

        // Each 128-bit half holds a partial sum for one pixel lane pair; fold them into a single Vector4-sized value.
        Vector128<float> result128 = Sse.Add(result256_0.GetLower(), result256_0.GetUpper());

        // Handle the last single pixel (odd Length).
        if ((this.Length & 1) != 0)
        {
            result128 = Fma.MultiplyAdd(
                Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef),
                Vector128.Create(*bufferStart),
                result128);
        }

        return *(Vector4*)&result128;
    }
    else
#endif
    {
        // Scalar-weight fallback: accumulate one pixel per iteration.
        // Destination color components
        Vector4 result = Vector4.Zero;
        float* bufferStart = this.bufferPtr;
        float* bufferEnd = this.bufferPtr + this.Length;

        while (bufferStart < bufferEnd)
        {
            // Equivalent to: result += offsetedRowSpan[i] * weights[i];
            result += rowStartRef * *bufferStart;

            bufferStart++;
            rowStartRef = ref Unsafe.Add(ref rowStartRef, 1);
        }

        return result;
    }
}
85163

86164
/// <summary>

tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ public void WorkingBufferSizeHintInBytes_IsAppliedCorrectly<TPixel>(
139139
testOutputDetails: workingBufferLimitInRows,
140140
appendPixelTypeToFileName: false);
141141
image.CompareToReferenceOutput(
142-
ImageComparer.TolerantPercentage(0.001f),
142+
ImageComparer.TolerantPercentage(0.004f),
143143
provider,
144144
testOutputDetails: workingBufferLimitInRows,
145145
appendPixelTypeToFileName: false);

0 commit comments

Comments
 (0)