@@ -80,26 +80,46 @@ protected override void OnFrameApply(ImageFrame<TPixel> source)
8080 var sourceRectangle = Rectangle . Intersect ( this . SourceRectangle , source . Bounds ( ) ) ;
8181
8282 // Preliminary gamma highlight pass
83- var gammaOperation = new ApplyGammaExposureRowOperation ( sourceRectangle , source . PixelBuffer , this . Configuration , this . gamma ) ;
84- ParallelRowIterator . IterateRows < ApplyGammaExposureRowOperation , Vector4 > (
85- this . Configuration ,
86- sourceRectangle ,
87- in gammaOperation ) ;
83+ if ( this . gamma == 3F )
84+ {
85+ var gammaOperation = new ApplyGamma3ExposureRowOperation ( sourceRectangle , source . PixelBuffer , this . Configuration ) ;
86+ ParallelRowIterator . IterateRows < ApplyGamma3ExposureRowOperation , Vector4 > (
87+ this . Configuration ,
88+ sourceRectangle ,
89+ in gammaOperation ) ;
90+ }
91+ else
92+ {
93+ var gammaOperation = new ApplyGammaExposureRowOperation ( sourceRectangle , source . PixelBuffer , this . Configuration , this . gamma ) ;
94+ ParallelRowIterator . IterateRows < ApplyGammaExposureRowOperation , Vector4 > (
95+ this . Configuration ,
96+ sourceRectangle ,
97+ in gammaOperation ) ;
98+ }
8899
89100 // Create a 0-filled buffer to use to store the result of the component convolutions
90101 using Buffer2D < Vector4 > processingBuffer = this . Configuration . MemoryAllocator . Allocate2D < Vector4 > ( source . Size ( ) , AllocationOptions . Clean ) ;
91102
92103 // Perform the 1D convolutions on all the kernel components and accumulate the results
93104 this . OnFrameApplyCore ( source , sourceRectangle , this . Configuration , processingBuffer ) ;
94105
95- float inverseGamma = 1 / this . gamma ;
96-
97106 // Apply the inverse gamma exposure pass, and write the final pixel data
98- var operation = new ApplyInverseGammaExposureRowOperation ( sourceRectangle , source . PixelBuffer , processingBuffer , this . Configuration , inverseGamma ) ;
99- ParallelRowIterator . IterateRows (
100- this . Configuration ,
101- sourceRectangle ,
102- in operation ) ;
107+ if ( this . gamma == 3F )
108+ {
109+ var operation = new ApplyInverseGamma3ExposureRowOperation ( sourceRectangle , source . PixelBuffer , processingBuffer , this . Configuration ) ;
110+ ParallelRowIterator . IterateRows (
111+ this . Configuration ,
112+ sourceRectangle ,
113+ in operation ) ;
114+ }
115+ else
116+ {
117+ var operation = new ApplyInverseGammaExposureRowOperation ( sourceRectangle , source . PixelBuffer , processingBuffer , this . Configuration , 1 / this . gamma ) ;
118+ ParallelRowIterator . IterateRows (
119+ this . Configuration ,
120+ sourceRectangle ,
121+ in operation ) ;
122+ }
103123 }
104124
105125 /// <summary>
@@ -286,6 +306,56 @@ public void Invoke(int y, Span<Vector4> span)
286306 }
287307 }
288308
/// <summary>
/// A <see langword="struct"/> implementing the 3F gamma exposure logic for <see cref="BokehBlurProcessor{T}"/>.
/// </summary>
/// <remarks>
/// This is the specialized counterpart of the generic gamma exposure pass: every color
/// component is cubed with plain multiplications instead of a call to a power function.
/// </remarks>
private readonly struct ApplyGamma3ExposureRowOperation : IRowOperation<Vector4>
{
    // Region being processed; X is used as the row offset and Width as the pixel count.
    private readonly Rectangle bounds;

    // Pixel buffer that is both read from and written back to.
    private readonly Buffer2D<TPixel> targetPixels;

    // Configuration forwarded to the pixel conversion routines.
    private readonly Configuration configuration;

    /// <summary>
    /// Initializes a new instance of the <see cref="ApplyGamma3ExposureRowOperation"/> struct.
    /// </summary>
    /// <param name="bounds">The region of the buffer to process.</param>
    /// <param name="targetPixels">The pixel buffer to read from and write to.</param>
    /// <param name="configuration">The configuration passed to the pixel conversions.</param>
    [MethodImpl(InliningOptions.ShortMethod)]
    public ApplyGamma3ExposureRowOperation(
        Rectangle bounds,
        Buffer2D<TPixel> targetPixels,
        Configuration configuration)
    {
        this.bounds = bounds;
        this.targetPixels = targetPixels;
        this.configuration = configuration;
    }

    /// <inheritdoc/>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void Invoke(int y, Span<Vector4> span)
    {
        // Row y of the target buffer, starting at the left edge of the processed region.
        Span<TPixel> rowPixels = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);

        // Load as many pixels as the temporary span can hold, as Vector4 values.
        Span<TPixel> inputPixels = rowPixels.Slice(0, span.Length);
        PixelOperations<TPixel>.Instance.ToVector4(this.configuration, inputPixels, span, PixelConversionModifiers.Premultiply);

        ref Vector4 first = ref MemoryMarshal.GetReference(span);
        int width = this.bounds.Width;

        for (int i = 0; i < width; i++)
        {
            ref Vector4 current = ref Unsafe.Add(ref first, i);

            // Remember the alpha channel, as it must not be affected by the exposure.
            float alpha = current.W;

            // Fast path for the default gamma exposure, which is 3. Rather than calling Math.Pow
            // once per component (an internal call with noticeable overhead), the whole pixel is
            // multiplied by itself in Vector4 form, and the saved alpha is put back afterwards.
            // The loop body can then be inlined and translated into vectorized instructions.
            Vector4 cubed = current * current * current;
            cubed.W = alpha;

            current = cubed;
        }

        // Write the exposed values back into the target row.
        PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, span, rowPixels);
    }
}
358+
289359 /// <summary>
290360 /// A <see langword="struct"/> implementing the inverse gamma exposure logic for <see cref="BokehBlurProcessor{T}"/>.
291361 /// </summary>
@@ -335,5 +405,82 @@ public void Invoke(int y)
335405 PixelOperations < TPixel > . Instance . FromVector4Destructive ( this . configuration , sourceRowSpan . Slice ( 0 , this . bounds . Width ) , targetPixelSpan , PixelConversionModifiers . Premultiply ) ;
336406 }
337407 }
408+
/// <summary>
/// A <see langword="struct"/> implementing the inverse 3F gamma exposure logic for <see cref="BokehBlurProcessor{T}"/>.
/// </summary>
/// <remarks>
/// Computing <c>pow(x, 1 / 3)</c> is the same as taking the cube root of <c>x</c>, so this operation
/// approximates the cube root of each color channel with a bit-level initial guess followed by
/// two Newton iterations, instead of calling a general purpose power function.
/// </remarks>
private readonly struct ApplyInverseGamma3ExposureRowOperation : IRowOperation
{
    // Region being processed; X is used as the row offset and Width as the pixel count.
    private readonly Rectangle bounds;

    // Destination buffer receiving the final pixel data.
    private readonly Buffer2D<TPixel> targetPixels;

    // Vector4 values to apply the inverse exposure to (modified in place).
    private readonly Buffer2D<Vector4> sourceValues;

    // Configuration forwarded to the pixel conversion routine.
    private readonly Configuration configuration;

    /// <summary>
    /// Initializes a new instance of the <see cref="ApplyInverseGamma3ExposureRowOperation"/> struct.
    /// </summary>
    /// <param name="bounds">The region of the buffers to process.</param>
    /// <param name="targetPixels">The pixel buffer to write the results to.</param>
    /// <param name="sourceValues">The buffer holding the values to process.</param>
    /// <param name="configuration">The configuration passed to the pixel conversion.</param>
    [MethodImpl(InliningOptions.ShortMethod)]
    public ApplyInverseGamma3ExposureRowOperation(
        Rectangle bounds,
        Buffer2D<TPixel> targetPixels,
        Buffer2D<Vector4> sourceValues,
        Configuration configuration)
    {
        this.bounds = bounds;
        this.targetPixels = targetPixels;
        this.sourceValues = sourceValues;
        this.configuration = configuration;
    }

    /// <inheritdoc/>
    [MethodImpl(InliningOptions.ShortMethod)]
    public unsafe void Invoke(int y)
    {
        Vector4 low = Vector4.Zero;
        var high = new Vector4(float.PositiveInfinity, float.PositiveInfinity, float.PositiveInfinity, float.PositiveInfinity);

        Span<TPixel> targetPixelSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
        Span<Vector4> sourceRowSpan = this.sourceValues.GetRowSpan(y).Slice(this.bounds.X);
        ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceRowSpan);

        for (int x = 0; x < this.bounds.Width; x++)
        {
            ref Vector4 v = ref Unsafe.Add(ref sourceRef, x);
            Vector4 clamp = Numerics.Clamp(v, low, high);

            double
                x64 = clamp.X,
                y64 = clamp.Y,
                z64 = clamp.Z;
            float a = clamp.W;

            // Reinterpret the raw IEEE 754 bits of each channel as an integer.
            ulong
                xl = *(ulong*)&x64,
                yl = *(ulong*)&y64,
                zl = *(ulong*)&z64;

            // Here we use a trick to compute the starting value x0 for the cube root. This is because doing pow(x, 1 / gamma) is the same as the gamma-th root
            // of x, and since gamma is 3 in this case, this means what we actually want is to find the cube root of our clamped values. For more info on the
            // constant below, see https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543. Here we perform
            // the same trick on all RGB channels separately to help the CPU execute them in parallel, and store the alpha channel to preserve it. Then we set
            // these values to the fields of a temporary 128-bit register, and use it to accelerate two steps of the Newton approximation using SIMD.
            // As a note for possible future improvements, we should come up with a good bitmask to perform the x0 approximation directly on float values.
            xl = 0x2a9f8a7be393b600 + (xl / 3);
            yl = 0x2a9f8a7be393b600 + (yl / 3);
            zl = 0x2a9f8a7be393b600 + (zl / 3);

            Vector4 y4;
            y4.X = (float)*(double*)&xl;
            y4.Y = (float)*(double*)&yl;
            y4.Z = (float)*(double*)&zl;
            y4.W = 0;

            // Two Newton iterations for the cube root: y = (2 / 3) * y + (1 / 3) * (value / y^2).
            // The W lane is meaningless here and is overwritten with the saved alpha below.
            y4 = (2 / 3f * y4) + (1 / 3f * (clamp / (y4 * y4)));
            y4 = (2 / 3f * y4) + (1 / 3f * (clamp / (y4 * y4)));

            // For a channel that is exactly zero the seed is just the constant above, which as a double
            // is far smaller than the smallest positive float, so the narrowing cast turns it into 0 and
            // the Newton steps evaluate 0 / 0 = NaN. The cube root of zero is zero, so restore it explicitly
            // rather than letting NaN values reach the pixel conversion.
            if (clamp.X == 0F)
            {
                y4.X = 0F;
            }

            if (clamp.Y == 0F)
            {
                y4.Y = 0F;
            }

            if (clamp.Z == 0F)
            {
                y4.Z = 0F;
            }

            y4.W = a;

            v = y4;
        }

        // Convert the processed values of this row into the target pixel format.
        PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, sourceRowSpan.Slice(0, this.bounds.Width), targetPixelSpan, PixelConversionModifiers.Premultiply);
    }
}
338485 }
339486}
0 commit comments