Skip to content

Commit e4ba017

Browse files
committed
Add optimized paths for default gamma exposure
1 parent 2e53a44 commit e4ba017

File tree

1 file changed

+159
-12
lines changed

1 file changed

+159
-12
lines changed

src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs

Lines changed: 159 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -80,26 +80,46 @@ protected override void OnFrameApply(ImageFrame<TPixel> source)
8080
var sourceRectangle = Rectangle.Intersect(this.SourceRectangle, source.Bounds());
8181

8282
// Preliminary gamma highlight pass
83-
var gammaOperation = new ApplyGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration, this.gamma);
84-
ParallelRowIterator.IterateRows<ApplyGammaExposureRowOperation, Vector4>(
85-
this.Configuration,
86-
sourceRectangle,
87-
in gammaOperation);
83+
if (this.gamma == 3F)
84+
{
85+
var gammaOperation = new ApplyGamma3ExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration);
86+
ParallelRowIterator.IterateRows<ApplyGamma3ExposureRowOperation, Vector4>(
87+
this.Configuration,
88+
sourceRectangle,
89+
in gammaOperation);
90+
}
91+
else
92+
{
93+
var gammaOperation = new ApplyGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration, this.gamma);
94+
ParallelRowIterator.IterateRows<ApplyGammaExposureRowOperation, Vector4>(
95+
this.Configuration,
96+
sourceRectangle,
97+
in gammaOperation);
98+
}
8899

89100
// Create a 0-filled buffer to use to store the result of the component convolutions
90101
using Buffer2D<Vector4> processingBuffer = this.Configuration.MemoryAllocator.Allocate2D<Vector4>(source.Size(), AllocationOptions.Clean);
91102

92103
// Perform the 1D convolutions on all the kernel components and accumulate the results
93104
this.OnFrameApplyCore(source, sourceRectangle, this.Configuration, processingBuffer);
94105

95-
float inverseGamma = 1 / this.gamma;
96-
97106
// Apply the inverse gamma exposure pass, and write the final pixel data
98-
var operation = new ApplyInverseGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, inverseGamma);
99-
ParallelRowIterator.IterateRows(
100-
this.Configuration,
101-
sourceRectangle,
102-
in operation);
107+
if (this.gamma == 3F)
108+
{
109+
var operation = new ApplyInverseGamma3ExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration);
110+
ParallelRowIterator.IterateRows(
111+
this.Configuration,
112+
sourceRectangle,
113+
in operation);
114+
}
115+
else
116+
{
117+
var operation = new ApplyInverseGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, 1 / this.gamma);
118+
ParallelRowIterator.IterateRows(
119+
this.Configuration,
120+
sourceRectangle,
121+
in operation);
122+
}
103123
}
104124

105125
/// <summary>
@@ -286,6 +306,56 @@ public void Invoke(int y, Span<Vector4> span)
286306
}
287307
}
288308

309+
/// <summary>
310+
/// A <see langword="struct"/> implementing the 3F gamma exposure logic for <see cref="BokehBlurProcessor{T}"/>.
311+
/// </summary>
312+
private readonly struct ApplyGamma3ExposureRowOperation : IRowOperation<Vector4>
313+
{
314+
private readonly Rectangle bounds;
315+
private readonly Buffer2D<TPixel> targetPixels;
316+
private readonly Configuration configuration;
317+
318+
[MethodImpl(InliningOptions.ShortMethod)]
319+
public ApplyGamma3ExposureRowOperation(
320+
Rectangle bounds,
321+
Buffer2D<TPixel> targetPixels,
322+
Configuration configuration)
323+
{
324+
this.bounds = bounds;
325+
this.targetPixels = targetPixels;
326+
this.configuration = configuration;
327+
}
328+
329+
/// <inheritdoc/>
330+
[MethodImpl(InliningOptions.ShortMethod)]
331+
public void Invoke(int y, Span<Vector4> span)
332+
{
333+
Span<TPixel> targetRowSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
334+
PixelOperations<TPixel>.Instance.ToVector4(this.configuration, targetRowSpan.Slice(0, span.Length), span, PixelConversionModifiers.Premultiply);
335+
ref Vector4 baseRef = ref MemoryMarshal.GetReference(span);
336+
337+
for (int x = 0; x < this.bounds.Width; x++)
338+
{
339+
ref Vector4 pixel4 = ref Unsafe.Add(ref baseRef, x);
340+
Vector4 v = pixel4;
341+
float a = v.W;
342+
343+
// Fast path for the default gamma exposure, which is 3. In this case we can skip
344+
// calling Math.Pow 3 times (one per component), as the method is an internal call and
345+
// introduces quite a bit of overhead. Instead, we can just manually multiply the whole
346+
// pixel by itself in Vector4 format to cube it, and then restore the alpha channel before copying it
347+
// back to the target index in the temporary span. The whole iteration will get completely
348+
// inlined and translated into vectorized instructions, with much better performance.
349+
v = v * v * v;
350+
v.W = a;
351+
352+
pixel4 = v;
353+
}
354+
355+
PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, span, targetRowSpan);
356+
}
357+
}
358+
289359
/// <summary>
290360
/// A <see langword="struct"/> implementing the inverse gamma exposure logic for <see cref="BokehBlurProcessor{T}"/>.
291361
/// </summary>
@@ -335,5 +405,82 @@ public void Invoke(int y)
335405
PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, sourceRowSpan.Slice(0, this.bounds.Width), targetPixelSpan, PixelConversionModifiers.Premultiply);
336406
}
337407
}
408+
409+
/// <summary>
410+
/// A <see langword="struct"/> implementing the inverse 3F gamma exposure logic for <see cref="BokehBlurProcessor{T}"/>.
411+
/// </summary>
412+
private readonly struct ApplyInverseGamma3ExposureRowOperation : IRowOperation
413+
{
414+
private readonly Rectangle bounds;
415+
private readonly Buffer2D<TPixel> targetPixels;
416+
private readonly Buffer2D<Vector4> sourceValues;
417+
private readonly Configuration configuration;
418+
419+
[MethodImpl(InliningOptions.ShortMethod)]
420+
public ApplyInverseGamma3ExposureRowOperation(
421+
Rectangle bounds,
422+
Buffer2D<TPixel> targetPixels,
423+
Buffer2D<Vector4> sourceValues,
424+
Configuration configuration)
425+
{
426+
this.bounds = bounds;
427+
this.targetPixels = targetPixels;
428+
this.sourceValues = sourceValues;
429+
this.configuration = configuration;
430+
}
431+
432+
/// <inheritdoc/>
433+
[MethodImpl(InliningOptions.ShortMethod)]
434+
public unsafe void Invoke(int y)
435+
{
436+
Vector4 low = Vector4.Zero;
437+
var high = new Vector4(float.PositiveInfinity, float.PositiveInfinity, float.PositiveInfinity, float.PositiveInfinity);
438+
439+
Span<TPixel> targetPixelSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
440+
Span<Vector4> sourceRowSpan = this.sourceValues.GetRowSpan(y).Slice(this.bounds.X);
441+
ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceRowSpan);
442+
443+
for (int x = 0; x < this.bounds.Width; x++)
444+
{
445+
ref Vector4 v = ref Unsafe.Add(ref sourceRef, x);
446+
Vector4 clamp = Numerics.Clamp(v, low, high);
447+
448+
double
449+
x64 = clamp.X,
450+
y64 = clamp.Y,
451+
z64 = clamp.Z;
452+
float a = clamp.W;
453+
454+
ulong
455+
xl = *(ulong*)&x64,
456+
yl = *(ulong*)&y64,
457+
zl = *(ulong*)&z64;
458+
459+
// Here we use a trick to compute the starting value x0 for the cube root. This is because doing pow(x, 1 / gamma) is the same as the gamma-th root
460+
// of x, and since gamma is 3 in this case, this means what we actually want is to find the cube root of our clamped values. For more info on the
461+
// constant below, see https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543. Here we perform
462+
// the same trick on all RGB channels separately to help the CPU execute them in parallel, and store the alpha channel to preserve it. Then we set
463+
// these values to the fields of a temporary 128-bit register, and use it to accelerate two steps of the Newton approximation using SIMD.
464+
// As a note for possible future improvements, we should come up with a good bitmask to perform the x0 approximation directly on float values.
465+
xl = 0x2a9f8a7be393b600 + (xl / 3);
466+
yl = 0x2a9f8a7be393b600 + (yl / 3);
467+
zl = 0x2a9f8a7be393b600 + (zl / 3);
468+
469+
Vector4 y4;
470+
y4.X = (float)*(double*)&xl;
471+
y4.Y = (float)*(double*)&yl;
472+
y4.Z = (float)*(double*)&zl;
473+
y4.W = 0;
474+
475+
y4 = (2 / 3f * y4) + (1 / 3f * (clamp / (y4 * y4)));
476+
y4 = (2 / 3f * y4) + (1 / 3f * (clamp / (y4 * y4)));
477+
y4.W = a;
478+
479+
v = y4;
480+
}
481+
482+
PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, sourceRowSpan.Slice(0, this.bounds.Width), targetPixelSpan, PixelConversionModifiers.Premultiply);
483+
}
484+
}
338485
}
339486
}

0 commit comments

Comments
 (0)