Add optimized paths for default gamma exposure

Sergio0694 · Sergio0694 · commit e4ba017dcf24 · 2020-12-14T01:51:24.000+01:00
diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
@@ -80,26 +80,46 @@ protected override void OnFrameApply(ImageFrame<TPixel> source)
             var sourceRectangle = Rectangle.Intersect(this.SourceRectangle, source.Bounds());
 
             // Preliminary gamma highlight pass
-            var gammaOperation = new ApplyGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration, this.gamma);
-            ParallelRowIterator.IterateRows<ApplyGammaExposureRowOperation, Vector4>(
-                this.Configuration,
-                sourceRectangle,
-                in gammaOperation);
+            if (this.gamma == 3F)
+            {
+                var gammaOperation = new ApplyGamma3ExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration);
+                ParallelRowIterator.IterateRows<ApplyGamma3ExposureRowOperation, Vector4>(
+                    this.Configuration,
+                    sourceRectangle,
+                    in gammaOperation);
+            }
+            else
+            {
+                var gammaOperation = new ApplyGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration, this.gamma);
+                ParallelRowIterator.IterateRows<ApplyGammaExposureRowOperation, Vector4>(
+                    this.Configuration,
+                    sourceRectangle,
+                    in gammaOperation);
+            }
 
             // Create a 0-filled buffer to use to store the result of the component convolutions
             using Buffer2D<Vector4> processingBuffer = this.Configuration.MemoryAllocator.Allocate2D<Vector4>(source.Size(), AllocationOptions.Clean);
 
             // Perform the 1D convolutions on all the kernel components and accumulate the results
             this.OnFrameApplyCore(source, sourceRectangle, this.Configuration, processingBuffer);
 
-            float inverseGamma = 1 / this.gamma;
-
             // Apply the inverse gamma exposure pass, and write the final pixel data
-            var operation = new ApplyInverseGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, inverseGamma);
-            ParallelRowIterator.IterateRows(
-                this.Configuration,
-                sourceRectangle,
-                in operation);
+            if (this.gamma == 3F)
+            {
+                var operation = new ApplyInverseGamma3ExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration);
+                ParallelRowIterator.IterateRows(
+                    this.Configuration,
+                    sourceRectangle,
+                    in operation);
+            }
+            else
+            {
+                var operation = new ApplyInverseGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, 1 / this.gamma);
+                ParallelRowIterator.IterateRows(
+                    this.Configuration,
+                    sourceRectangle,
+                    in operation);
+            }
         }
 
         /// <summary>
@@ -286,6 +306,56 @@ public void Invoke(int y, Span<Vector4> span)
             }
         }
 
+        /// <summary>
+        /// A <see langword="struct"/> implementing the 3F gamma exposure logic for <see cref="BokehBlurProcessor{T}"/>.
+        /// </summary>
+        private readonly struct ApplyGamma3ExposureRowOperation : IRowOperation<Vector4>
+        {
+            private readonly Rectangle bounds;
+            private readonly Buffer2D<TPixel> targetPixels;
+            private readonly Configuration configuration;
+
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public ApplyGamma3ExposureRowOperation(
+                Rectangle bounds,
+                Buffer2D<TPixel> targetPixels,
+                Configuration configuration)
+            {
+                this.bounds = bounds;
+                this.targetPixels = targetPixels;
+                this.configuration = configuration;
+            }
+
+            /// <inheritdoc/>
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public void Invoke(int y, Span<Vector4> span)
+            {
+                Span<TPixel> targetRowSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
+                PixelOperations<TPixel>.Instance.ToVector4(this.configuration, targetRowSpan.Slice(0, span.Length), span, PixelConversionModifiers.Premultiply);
+                ref Vector4 baseRef = ref MemoryMarshal.GetReference(span);
+
+                for (int x = 0; x < this.bounds.Width; x++)
+                {
+                    ref Vector4 pixel4 = ref Unsafe.Add(ref baseRef, x);
+                    Vector4 v = pixel4;
+                    float a = v.W;
+
+                    // Fast path for the default gamma exposure, which is 3. In this case we can skip
+                    // calling Math.Pow 3 times (one per component), as the method is an internal call and
+                    // introduces quite a bit of overhead. Instead, we can just manually multiply the whole
+                    // pixel in Vector4 format 3 times, and then restore the alpha channel before copying it
+                    // back to the target index in the temporary span. The whole iteration will get completely
+                    // inlined and traslated into vectorized instructions, with much better performance.
+                    v = v * v * v;
+                    v.W = a;
+
+                    pixel4 = v;
+                }
+
+                PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, span, targetRowSpan);
+            }
+        }
+
         /// <summary>
         /// A <see langword="struct"/> implementing the inverse gamma exposure logic for <see cref="BokehBlurProcessor{T}"/>.
         /// </summary>
@@ -335,5 +405,82 @@ public void Invoke(int y)
                 PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, sourceRowSpan.Slice(0, this.bounds.Width), targetPixelSpan, PixelConversionModifiers.Premultiply);
             }
         }
+
+        /// <summary>
+        /// A <see langword="struct"/> implementing the inverse 3F gamma exposure logic for <see cref="BokehBlurProcessor{T}"/>.
+        /// </summary>
+        private readonly struct ApplyInverseGamma3ExposureRowOperation : IRowOperation
+        {
+            private readonly Rectangle bounds;
+            private readonly Buffer2D<TPixel> targetPixels;
+            private readonly Buffer2D<Vector4> sourceValues;
+            private readonly Configuration configuration;
+
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public ApplyInverseGamma3ExposureRowOperation(
+                Rectangle bounds,
+                Buffer2D<TPixel> targetPixels,
+                Buffer2D<Vector4> sourceValues,
+                Configuration configuration)
+            {
+                this.bounds = bounds;
+                this.targetPixels = targetPixels;
+                this.sourceValues = sourceValues;
+                this.configuration = configuration;
+            }
+
+            /// <inheritdoc/>
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public unsafe void Invoke(int y)
+            {
+                Vector4 low = Vector4.Zero;
+                var high = new Vector4(float.PositiveInfinity, float.PositiveInfinity, float.PositiveInfinity, float.PositiveInfinity);
+
+                Span<TPixel> targetPixelSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
+                Span<Vector4> sourceRowSpan = this.sourceValues.GetRowSpan(y).Slice(this.bounds.X);
+                ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceRowSpan);
+
+                for (int x = 0; x < this.bounds.Width; x++)
+                {
+                    ref Vector4 v = ref Unsafe.Add(ref sourceRef, x);
+                    Vector4 clamp = Numerics.Clamp(v, low, high);
+
+                    double
+                        x64 = clamp.X,
+                        y64 = clamp.Y,
+                        z64 = clamp.Z;
+                    float a = clamp.W;
+
+                    ulong
+                        xl = *(ulong*)&x64,
+                        yl = *(ulong*)&y64,
+                        zl = *(ulong*)&z64;
+
+                    // Here we use a trick to compute the starting value x0 for the cube root. This is because doing pow(x, 1 / gamma) is the same as the gamma-th root
+                    // of x, and since gamme is 3 in this case, this means what we actually want is to find the cube root of our clamped values. For more info on the
+                    // constant below, see https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543. Here we perform
+                    // the same trick on all RGB channels separately to help the CPU execute them in paralle, and store the alpha channel to preserve it. Then we set
+                    // these values to the fields of a temporary 128-bit register, and use it to accelerate two steps of the Newton approximation using SIMD.
+                    // As a note for possible future improvements, we should come up with a good bitmask to perform the x0 approximation directly on float values.
+                    xl = 0x2a9f8a7be393b600 + (xl / 3);
+                    yl = 0x2a9f8a7be393b600 + (yl / 3);
+                    zl = 0x2a9f8a7be393b600 + (zl / 3);
+
+                    Vector4 y4;
+                    y4.X = (float)*(double*)&xl;
+                    y4.Y = (float)*(double*)&yl;
+                    y4.Z = (float)*(double*)&zl;
+                    y4.W = 0;
+
+                    y4 = (2 / 3f * y4) + (1 / 3f * (clamp / (y4 * y4)));
+                    y4 = (2 / 3f * y4) + (1 / 3f * (clamp / (y4 * y4)));
+                    y4.W = a;
+
+                    v = y4;
+                }
+
+                PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, sourceRowSpan.Slice(0, this.bounds.Width), targetPixelSpan, PixelConversionModifiers.Premultiply);
+            }
+        }
     }
 }