Skip to content

Commit 7a7080c

Browse files
Merge pull request #1475 from SixLabors/sp/bokeh-blur-speedup
Optimize bokeh blur convolution
2 parents 943830f + 80617a0 commit 7a7080c

File tree

9 files changed

+465
-177
lines changed

9 files changed

+465
-177
lines changed

src/ImageSharp/Common/Helpers/Buffer2DUtils.cs

Lines changed: 0 additions & 109 deletions
This file was deleted.

src/ImageSharp/Common/Helpers/Numerics.cs

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -547,5 +547,140 @@ public static void UnPremultiply(Span<Vector4> vectors)
547547
}
548548
}
549549
}
550+
551+
/// <summary>
/// Calculates the cube pow of all the XYZ channels of the input vectors.
/// </summary>
/// <param name="vectors">The span of vectors</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe void CubePowOnXYZ(Span<Vector4> vectors)
{
    // Fast path for the default gamma exposure, which is 3. Instead of calling
    // Math.Pow once per component (an internal call with noticeable overhead),
    // the whole pixel is multiplied by itself twice in Vector4 form and the
    // alpha channel is restored afterwards. The loop body gets fully inlined
    // and translated into vectorized instructions, with much better performance.
    for (int i = 0; i < vectors.Length; i++)
    {
        ref Vector4 pixel = ref vectors[i];

        // Save alpha so the cube only affects the XYZ channels.
        float alpha = pixel.W;

        Vector4 cubed = pixel * pixel * pixel;
        cubed.W = alpha;

        pixel = cubed;
    }
}
579+
580+
/// <summary>
/// Calculates the cube root of all the XYZ channels of the input vectors.
/// The W (alpha) channel of each vector is preserved unchanged.
/// </summary>
/// <param name="vectors">The span of vectors</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe void CubeRootOnXYZ(Span<Vector4> vectors)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    if (Sse41.IsSupported)
    {
        // Reinterpret the Vector4 span as a sequence of Vector128<float> values
        // (same 16-byte layout) so the whole pixel is processed per iteration.
        ref Vector128<float> vectors128Ref = ref Unsafe.As<Vector4, Vector128<float>>(ref MemoryMarshal.GetReference(vectors));
        ref Vector128<float> vectors128End = ref Unsafe.Add(ref vectors128Ref, vectors.Length);

        // Constants for the bit-level initial guess: 341 ~= 1024 / 3, and the
        // bit patterns of -0.0f (sign mask) and 1.0f reused as integer masks.
        var v128_341 = Vector128.Create(341);
        Vector128<int> v128_negativeZero = Vector128.Create(-0.0f).AsInt32();
        Vector128<int> v128_one = Vector128.Create(1.0f).AsInt32();

        // Coefficients for the Newton iteration: y' = (2/3)y + (1/3)(x / y^2).
        var v128_13rd = Vector128.Create(1 / 3f);
        var v128_23rds = Vector128.Create(2 / 3f);

        while (Unsafe.IsAddressLessThan(ref vectors128Ref, ref vectors128End))
        {
            Vector128<float> vecx = vectors128Ref;
            Vector128<int> veax = vecx.AsInt32();

            // If we can use SSE41 instructions, we can vectorize the entire cube root calculation, and also execute it
            // directly on 32 bit floating point values. What follows is a vectorized implementation of this method:
            // https://www.musicdsp.org/en/latest/Other/206-fast-cube-root-square-root-and-reciprocal-for-x86-sse-cpus.html.
            // The integer arithmetic below manipulates the raw float bits to form an initial estimate: clear the
            // sign, rescale the exponent by ~1/3 via the shift/multiply, then re-apply the original sign bit.
            // Furthermore, after the initial setup in vectorized form, we're doing two Newton approximations here
            // using a different succession (the same used below), which should be less unstable due to not having cube pow.
            veax = Sse2.AndNot(v128_negativeZero, veax);
            veax = Sse2.Subtract(veax, v128_one);
            veax = Sse2.ShiftRightArithmetic(veax, 10);
            veax = Sse41.MultiplyLow(veax, v128_341);
            veax = Sse2.Add(veax, v128_one);
            veax = Sse2.AndNot(v128_negativeZero, veax);
            veax = Sse2.Or(veax, Sse2.And(vecx.AsInt32(), v128_negativeZero));

            Vector128<float> y4 = veax.AsSingle();

            // Two Newton refinement steps; the FMA form fuses the multiply-add when available.
            if (Fma.IsSupported)
            {
                y4 = Fma.MultiplyAdd(v128_23rds, y4, Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
                y4 = Fma.MultiplyAdd(v128_23rds, y4, Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
            }
            else
            {
                y4 = Sse.Add(Sse.Multiply(v128_23rds, y4), Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
                y4 = Sse.Add(Sse.Multiply(v128_23rds, y4), Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
            }

            // Restore the original W lane from the source vector (insert element 3 of vecx
            // into element 3 of y4), so the alpha channel passes through untouched.
            y4 = Sse41.Insert(y4, vecx, 0xF0);

            vectors128Ref = y4;
            vectors128Ref = ref Unsafe.Add(ref vectors128Ref, 1);
        }

        return;
    }
#endif
    ref Vector4 vectorsRef = ref MemoryMarshal.GetReference(vectors);
    ref Vector4 vectorsEnd = ref Unsafe.Add(ref vectorsRef, vectors.Length);

    // Fallback with scalar preprocessing and vectorized approximation steps
    while (Unsafe.IsAddressLessThan(ref vectorsRef, ref vectorsEnd))
    {
        Vector4 v = vectorsRef;

        double
            x64 = v.X,
            y64 = v.Y,
            z64 = v.Z;
        float a = v.W;

        ulong
            xl = *(ulong*)&x64,
            yl = *(ulong*)&y64,
            zl = *(ulong*)&z64;

        // Here we use a trick to compute the starting value x0 for the cube root. This is because doing
        // pow(x, 1 / gamma) is the same as the gamma-th root of x, and since gamma is 3 in this case,
        // this means what we actually want is to find the cube root of our clamped values.
        // For more info on the constant below, see:
        // https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543.
        // Here we perform the same trick on all RGB channels separately to help the CPU execute them in parallel, and
        // store the alpha channel to preserve it. Then we set these values to the fields of a temporary 128-bit
        // register, and use it to accelerate two steps of the Newton approximation using SIMD.
        xl = 0x2a9f8a7be393b600 + (xl / 3);
        yl = 0x2a9f8a7be393b600 + (yl / 3);
        zl = 0x2a9f8a7be393b600 + (zl / 3);

        Vector4 y4;
        y4.X = (float)*(double*)&xl;
        y4.Y = (float)*(double*)&yl;
        y4.Z = (float)*(double*)&zl;
        y4.W = 0;

        // Two Newton iterations: y' = (2/3)y + (1/3)(v / y^2), then restore alpha.
        y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
        y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
        y4.W = a;

        vectorsRef = y4;
        vectorsRef = ref Unsafe.Add(ref vectorsRef, 1);
    }
}
550685
}
551686
}

src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
using System;
55
using System.Numerics;
66
using System.Runtime.CompilerServices;
7+
using System.Runtime.InteropServices;
78
using SixLabors.ImageSharp.Advanced;
89
using SixLabors.ImageSharp.Memory;
910
using SixLabors.ImageSharp.PixelFormats;
@@ -91,31 +92,30 @@ public IImageProcessor<TPixel> CreatePixelSpecificProcessor<TPixel>(Configuratio
9192
/// it is actually used, because it does not use any generic parameters internally. Defining in a non-generic class means that there will only
9293
/// ever be a single instantiation of this type for the JIT/AOT compilers to process, instead of having duplicate versions for each pixel type.
9394
/// </remarks>
94-
internal readonly struct ApplyHorizontalConvolutionRowOperation : IRowOperation
95+
internal readonly struct SecondPassConvolutionRowOperation : IRowOperation
9596
{
9697
private readonly Rectangle bounds;
9798
private readonly Buffer2D<Vector4> targetValues;
9899
private readonly Buffer2D<ComplexVector4> sourceValues;
100+
private readonly KernelSamplingMap map;
99101
private readonly Complex64[] kernel;
100102
private readonly float z;
101103
private readonly float w;
102-
private readonly int maxY;
103-
private readonly int maxX;
104104

105105
[MethodImpl(InliningOptions.ShortMethod)]
106-
public ApplyHorizontalConvolutionRowOperation(
106+
public SecondPassConvolutionRowOperation(
107107
Rectangle bounds,
108108
Buffer2D<Vector4> targetValues,
109109
Buffer2D<ComplexVector4> sourceValues,
110+
KernelSamplingMap map,
110111
Complex64[] kernel,
111112
float z,
112113
float w)
113114
{
114115
this.bounds = bounds;
115-
this.maxY = this.bounds.Bottom - 1;
116-
this.maxX = this.bounds.Right - 1;
117116
this.targetValues = targetValues;
118117
this.sourceValues = sourceValues;
118+
this.map = map;
119119
this.kernel = kernel;
120120
this.z = z;
121121
this.w = w;
@@ -125,11 +125,33 @@ public ApplyHorizontalConvolutionRowOperation(
125125
[MethodImpl(InliningOptions.ShortMethod)]
126126
public void Invoke(int y)
127127
{
128-
Span<Vector4> targetRowSpan = this.targetValues.GetRowSpan(y).Slice(this.bounds.X);
128+
int boundsX = this.bounds.X;
129+
int boundsWidth = this.bounds.Width;
130+
int kernelSize = this.kernel.Length;
129131

130-
for (int x = 0; x < this.bounds.Width; x++)
132+
Span<int> rowOffsets = this.map.GetRowOffsetSpan();
133+
ref int sampleRowBase = ref Unsafe.Add(ref MemoryMarshal.GetReference(rowOffsets), (y - this.bounds.Y) * kernelSize);
134+
135+
// The target buffer is zeroed initially and then it accumulates the results
136+
// of each partial convolution, so we don't have to clear it here as well
137+
ref Vector4 targetBase = ref this.targetValues.GetElementUnsafe(boundsX, y);
138+
ref Complex64 kernelBase = ref this.kernel[0];
139+
140+
for (int kY = 0; kY < kernelSize; kY++)
131141
{
132-
Buffer2DUtils.Convolve4AndAccumulatePartials(this.kernel, this.sourceValues, targetRowSpan, y, x, this.bounds.Y, this.maxY, this.bounds.X, this.maxX, this.z, this.w);
142+
// Get the precalculated source sample row for this kernel row and copy to our buffer
143+
int sampleY = Unsafe.Add(ref sampleRowBase, kY);
144+
ref ComplexVector4 sourceBase = ref this.sourceValues.GetElementUnsafe(0, sampleY);
145+
Complex64 factor = Unsafe.Add(ref kernelBase, kY);
146+
147+
for (int x = 0; x < boundsWidth; x++)
148+
{
149+
ref Vector4 target = ref Unsafe.Add(ref targetBase, x);
150+
ComplexVector4 sample = Unsafe.Add(ref sourceBase, x);
151+
ComplexVector4 partial = factor * sample;
152+
153+
target += partial.WeightedSum(this.z, this.w);
154+
}
133155
}
134156
}
135157
}

0 commit comments

Comments
 (0)