Skip to content

Commit 7a7080c

Browse files
Merge pull request #1475 from SixLabors/sp/bokeh-blur-speedup
Optimize bokeh blur convolution
2 parents 943830f + 80617a0 commit 7a7080c

File tree

9 files changed

+465
-177
lines changed

9 files changed

+465
-177
lines changed

src/ImageSharp/Common/Helpers/Buffer2DUtils.cs

Lines changed: 0 additions & 109 deletions
This file was deleted.

src/ImageSharp/Common/Helpers/Numerics.cs

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -547,5 +547,140 @@ public static void UnPremultiply(Span<Vector4> vectors)
547547
}
548548
}
549549
}
550+
551+
/// <summary>
/// Calculates the cube pow of all the XYZ channels of the input vectors.
/// </summary>
/// <param name="vectors">The span of vectors</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe void CubePowOnXYZ(Span<Vector4> vectors)
{
    // Fast path for the default gamma exposure, which is 3. Instead of calling
    // Math.Pow once per component (an internal call with noticeable overhead),
    // the whole pixel is multiplied by itself twice in Vector4 form and the
    // alpha channel is restored afterwards. The loop body gets fully inlined
    // and translated into vectorized instructions, with much better performance.
    for (int i = 0; i < vectors.Length; i++)
    {
        ref Vector4 pixel = ref vectors[i];

        // Save alpha so the cube only affects the XYZ channels.
        float alpha = pixel.W;

        Vector4 cubed = pixel * pixel * pixel;
        cubed.W = alpha;

        pixel = cubed;
    }
}
579+
580+
/// <summary>
/// Calculates the cube root of all the XYZ channels of the input vectors.
/// The W (alpha) channel of each vector is preserved unchanged.
/// </summary>
/// <param name="vectors">The span of vectors</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe void CubeRootOnXYZ(Span<Vector4> vectors)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    if (Sse41.IsSupported)
    {
        // Reinterpret the Vector4 span as a sequence of Vector128<float> values
        // (same 16-byte layout) so the whole pixel is processed per iteration.
        ref Vector128<float> vectors128Ref = ref Unsafe.As<Vector4, Vector128<float>>(ref MemoryMarshal.GetReference(vectors));
        ref Vector128<float> vectors128End = ref Unsafe.Add(ref vectors128Ref, vectors.Length);

        // Constants for the bit-level initial guess: 341 ~= 1024 / 3, and the
        // bit patterns of -0.0f (sign mask) and 1.0f reused as integer masks.
        var v128_341 = Vector128.Create(341);
        Vector128<int> v128_negativeZero = Vector128.Create(-0.0f).AsInt32();
        Vector128<int> v128_one = Vector128.Create(1.0f).AsInt32();

        // Coefficients for the Newton iteration: y' = (2/3)y + (1/3)(x / y^2).
        var v128_13rd = Vector128.Create(1 / 3f);
        var v128_23rds = Vector128.Create(2 / 3f);

        while (Unsafe.IsAddressLessThan(ref vectors128Ref, ref vectors128End))
        {
            Vector128<float> vecx = vectors128Ref;
            Vector128<int> veax = vecx.AsInt32();

            // If we can use SSE41 instructions, we can vectorize the entire cube root calculation, and also execute it
            // directly on 32 bit floating point values. What follows is a vectorized implementation of this method:
            // https://www.musicdsp.org/en/latest/Other/206-fast-cube-root-square-root-and-reciprocal-for-x86-sse-cpus.html.
            // The integer arithmetic below manipulates the raw float bits to form an initial estimate: clear the
            // sign, rescale the exponent by ~1/3 via the shift/multiply, then re-apply the original sign bit.
            // Furthermore, after the initial setup in vectorized form, we're doing two Newton approximations here
            // using a different succession (the same used below), which should be less unstable due to not having cube pow.
            veax = Sse2.AndNot(v128_negativeZero, veax);
            veax = Sse2.Subtract(veax, v128_one);
            veax = Sse2.ShiftRightArithmetic(veax, 10);
            veax = Sse41.MultiplyLow(veax, v128_341);
            veax = Sse2.Add(veax, v128_one);
            veax = Sse2.AndNot(v128_negativeZero, veax);
            veax = Sse2.Or(veax, Sse2.And(vecx.AsInt32(), v128_negativeZero));

            Vector128<float> y4 = veax.AsSingle();

            // Two Newton refinement steps; the FMA form fuses the multiply-add when available.
            if (Fma.IsSupported)
            {
                y4 = Fma.MultiplyAdd(v128_23rds, y4, Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
                y4 = Fma.MultiplyAdd(v128_23rds, y4, Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
            }
            else
            {
                y4 = Sse.Add(Sse.Multiply(v128_23rds, y4), Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
                y4 = Sse.Add(Sse.Multiply(v128_23rds, y4), Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
            }

            // Restore the original W lane from the source vector (insert element 3 of vecx
            // into element 3 of y4), so the alpha channel passes through untouched.
            y4 = Sse41.Insert(y4, vecx, 0xF0);

            vectors128Ref = y4;
            vectors128Ref = ref Unsafe.Add(ref vectors128Ref, 1);
        }

        return;
    }
#endif
    ref Vector4 vectorsRef = ref MemoryMarshal.GetReference(vectors);
    ref Vector4 vectorsEnd = ref Unsafe.Add(ref vectorsRef, vectors.Length);

    // Fallback with scalar preprocessing and vectorized approximation steps
    while (Unsafe.IsAddressLessThan(ref vectorsRef, ref vectorsEnd))
    {
        Vector4 v = vectorsRef;

        double
            x64 = v.X,
            y64 = v.Y,
            z64 = v.Z;
        float a = v.W;

        ulong
            xl = *(ulong*)&x64,
            yl = *(ulong*)&y64,
            zl = *(ulong*)&z64;

        // Here we use a trick to compute the starting value x0 for the cube root. This is because doing
        // pow(x, 1 / gamma) is the same as the gamma-th root of x, and since gamma is 3 in this case,
        // this means what we actually want is to find the cube root of our clamped values.
        // For more info on the constant below, see:
        // https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543.
        // Here we perform the same trick on all RGB channels separately to help the CPU execute them in parallel, and
        // store the alpha channel to preserve it. Then we set these values to the fields of a temporary 128-bit
        // register, and use it to accelerate two steps of the Newton approximation using SIMD.
        xl = 0x2a9f8a7be393b600 + (xl / 3);
        yl = 0x2a9f8a7be393b600 + (yl / 3);
        zl = 0x2a9f8a7be393b600 + (zl / 3);

        Vector4 y4;
        y4.X = (float)*(double*)&xl;
        y4.Y = (float)*(double*)&yl;
        y4.Z = (float)*(double*)&zl;
        y4.W = 0;

        // Two Newton iterations: y' = (2/3)y + (1/3)(v / y^2), then restore alpha.
        y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
        y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
        y4.W = a;

        vectorsRef = y4;
        vectorsRef = ref Unsafe.Add(ref vectorsRef, 1);
    }
}
550685
}
551686
}

src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
using System;
55
using System.Numerics;
66
using System.Runtime.CompilerServices;
7+
using System.Runtime.InteropServices;
78
using SixLabors.ImageSharp.Advanced;
89
using SixLabors.ImageSharp.Memory;
910
using SixLabors.ImageSharp.PixelFormats;
@@ -91,31 +92,30 @@ public IImageProcessor<TPixel> CreatePixelSpecificProcessor<TPixel>(Configuratio
9192
/// it is actually used, because it does not use any generic parameters internally. Defining in a non-generic class means that there will only
9293
/// ever be a single instantiation of this type for the JIT/AOT compilers to process, instead of having duplicate versions for each pixel type.
9394
/// </remarks>
94-
internal readonly struct ApplyHorizontalConvolutionRowOperation : IRowOperation
95+
internal readonly struct SecondPassConvolutionRowOperation : IRowOperation
9596
{
9697
private readonly Rectangle bounds;
9798
private readonly Buffer2D<Vector4> targetValues;
9899
private readonly Buffer2D<ComplexVector4> sourceValues;
100+
private readonly KernelSamplingMap map;
99101
private readonly Complex64[] kernel;
100102
private readonly float z;
101103
private readonly float w;
102-
private readonly int maxY;
103-
private readonly int maxX;
104104

105105
[MethodImpl(InliningOptions.ShortMethod)]
106-
public ApplyHorizontalConvolutionRowOperation(
106+
public SecondPassConvolutionRowOperation(
107107
Rectangle bounds,
108108
Buffer2D<Vector4> targetValues,
109109
Buffer2D<ComplexVector4> sourceValues,
110+
KernelSamplingMap map,
110111
Complex64[] kernel,
111112
float z,
112113
float w)
113114
{
114115
this.bounds = bounds;
115-
this.maxY = this.bounds.Bottom - 1;
116-
this.maxX = this.bounds.Right - 1;
117116
this.targetValues = targetValues;
118117
this.sourceValues = sourceValues;
118+
this.map = map;
119119
this.kernel = kernel;
120120
this.z = z;
121121
this.w = w;
@@ -125,11 +125,33 @@ public ApplyHorizontalConvolutionRowOperation(
125125
[MethodImpl(InliningOptions.ShortMethod)]
126126
public void Invoke(int y)
127127
{
128-
Span<Vector4> targetRowSpan = this.targetValues.GetRowSpan(y).Slice(this.bounds.X);
128+
int boundsX = this.bounds.X;
129+
int boundsWidth = this.bounds.Width;
130+
int kernelSize = this.kernel.Length;
129131

130-
for (int x = 0; x < this.bounds.Width; x++)
132+
Span<int> rowOffsets = this.map.GetRowOffsetSpan();
133+
ref int sampleRowBase = ref Unsafe.Add(ref MemoryMarshal.GetReference(rowOffsets), (y - this.bounds.Y) * kernelSize);
134+
135+
// The target buffer is zeroed initially and then it accumulates the results
136+
// of each partial convolution, so we don't have to clear it here as well
137+
ref Vector4 targetBase = ref this.targetValues.GetElementUnsafe(boundsX, y);
138+
ref Complex64 kernelBase = ref this.kernel[0];
139+
140+
for (int kY = 0; kY < kernelSize; kY++)
131141
{
132-
Buffer2DUtils.Convolve4AndAccumulatePartials(this.kernel, this.sourceValues, targetRowSpan, y, x, this.bounds.Y, this.maxY, this.bounds.X, this.maxX, this.z, this.w);
142+
// Get the precalculated source sample row for this kernel row and copy to our buffer
143+
int sampleY = Unsafe.Add(ref sampleRowBase, kY);
144+
ref ComplexVector4 sourceBase = ref this.sourceValues.GetElementUnsafe(0, sampleY);
145+
Complex64 factor = Unsafe.Add(ref kernelBase, kY);
146+
147+
for (int x = 0; x < boundsWidth; x++)
148+
{
149+
ref Vector4 target = ref Unsafe.Add(ref targetBase, x);
150+
ComplexVector4 sample = Unsafe.Add(ref sourceBase, x);
151+
ComplexVector4 partial = factor * sample;
152+
153+
target += partial.WeightedSum(this.z, this.w);
154+
}
133155
}
134156
}
135157
}

0 commit comments

Comments
 (0)