@@ -547,5 +547,131 @@ public static void UnPremultiply(Span<Vector4> vectors)
547547 }
548548 }
549549 }
550+
551+ /// <summary>
552+ /// Calculates the cube pow of all the XYZ channels of the input vectors.
553+ /// </summary>
554+ /// <param name="vectors">The span of vectors</param>
555+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
556+ public static unsafe void CubePowOnXYZ ( Span < Vector4 > vectors )
557+ {
558+ ref Vector4 baseRef = ref MemoryMarshal . GetReference ( vectors ) ;
559+ int length = vectors . Length ;
560+
561+ for ( int x = 0 ; x < length ; x ++ )
562+ {
563+ ref Vector4 pixel4 = ref Unsafe . Add ( ref baseRef , x ) ;
564+ Vector4 v = pixel4 ;
565+ float a = v . W ;
566+
567+ // Fast path for the default gamma exposure, which is 3. In this case we can skip
568+ // calling Math.Pow 3 times (one per component), as the method is an internal call and
569+ // introduces quite a bit of overhead. Instead, we can just manually multiply the whole
570+ // pixel in Vector4 format 3 times, and then restore the alpha channel before copying it
571+ // back to the target index in the temporary span. The whole iteration will get completely
572+ // inlined and traslated into vectorized instructions, with much better performance.
573+ v = v * v * v ;
574+ v . W = a ;
575+
576+ pixel4 = v ;
577+ }
578+ }
579+
580+ /// <summary>
581+ /// Calculates the cube root of all the XYZ channels of the input vectors.
582+ /// </summary>
583+ /// <param name="vectors">The span of vectors</param>
584+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
585+ public static unsafe void CubeRootOnXYZ ( Span < Vector4 > vectors )
586+ {
587+ ref Vector4 vectorsRef = ref MemoryMarshal . GetReference ( vectors ) ;
588+ int length = vectors . Length ;
589+
590+ #if SUPPORTS_RUNTIME_INTRINSICS
591+ if ( Sse41 . IsSupported )
592+ {
593+ var v128_0x7FFFFFFF = Vector128 . Create ( 0x7FFFFFFF ) ;
594+ var v128_0x3F8000000 = Vector128 . Create ( 0x3F800000 ) ;
595+ var v128_341 = Vector128 . Create ( 341 ) ;
596+ var v128_0x80000000 = Vector128 . Create ( unchecked ( ( int ) 0x80000000 ) ) ;
597+ var v4_23rds = new Vector4 ( 2 / 3f ) ;
598+ var v4_13rds = new Vector4 ( 1 / 3f ) ;
599+
600+ for ( int x = 0 ; x < length ; x ++ )
601+ {
602+ ref Vector4 v4 = ref Unsafe . Add ( ref vectorsRef , x ) ;
603+
604+ Vector4 vx = v4 ;
605+ float a = vx . W ;
606+ Vector128 < int > veax = Unsafe . As < Vector4 , Vector128 < int > > ( ref vx ) ;
607+ Vector128 < int > vecx = veax ;
608+
609+ // If we can use SSE41 instructions, we can vectorize the entire cube root calculation, and also execute it
610+ // directly on 32 bit floating point values. What follows is a vectorized implementation of this method:
611+ // https://www.musicdsp.org/en/latest/Other/206-fast-cube-root-square-root-and-reciprocal-for-x86-sse-cpus.html.
612+ // Furthermore, after the initial setup in vectorized form, we're doing two Newton approximations here
613+ // using a different succession (the same used below), which should be less unstable due to not having cube pow.
614+ veax = Sse2 . And ( veax , v128_0x7FFFFFFF ) ;
615+ veax = Sse2 . Subtract ( veax , v128_0x3F8000000 ) ;
616+ veax = Sse2 . ShiftRightArithmetic ( veax , 10 ) ;
617+ veax = Sse41 . MultiplyLow ( veax , v128_341 ) ;
618+ veax = Sse2 . Add ( veax , v128_0x3F8000000 ) ;
619+ veax = Sse2 . And ( veax , v128_0x7FFFFFFF ) ;
620+ vecx = Sse2 . And ( vecx , v128_0x80000000 ) ;
621+ veax = Sse2 . Or ( veax , vecx ) ;
622+
623+ Vector4 y4 = * ( Vector4 * ) & veax ;
624+
625+ y4 = ( v4_23rds * y4 ) + ( v4_13rds * ( vx / ( y4 * y4 ) ) ) ;
626+ y4 = ( v4_23rds * y4 ) + ( v4_13rds * ( vx / ( y4 * y4 ) ) ) ;
627+ y4. W = a ;
628+
629+ v4 = y4 ;
630+ }
631+
632+ return ;
633+ }
634+ #else
635+ for ( int x = 0 ; x < length ; x++ )
636+ {
637+ ref Vector4 v = ref Unsafe . Add ( ref vectorsRef , x ) ;
638+
639+ double
640+ x64 = v . X ,
641+ y64 = v . Y ,
642+ z64 = v . Z ;
643+ float a = v. W;
644+
645+ ulong
646+ xl = * ( ulong * ) & x64,
647+ yl = * ( ulong * ) & y64 ,
648+ zl = * ( ulong * ) & z64 ;
649+
650+ // Here we use a trick to compute the starting value x0 for the cube root. This is because doing
651+ // pow(x, 1 / gamma) is the same as the gamma-th root of x, and since gamme is 3 in this case,
652+ // this means what we actually want is to find the cube root of our clamped values.
653+ // For more info on the constant below, see:
654+ // https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543.
655+ // Here we perform the same trick on all RGB channels separately to help the CPU execute them in paralle, and
656+ // store the alpha channel to preserve it. Then we set these values to the fields of a temporary 128-bit
657+ // register, and use it to accelerate two steps of the Newton approximation using SIMD.
658+ xl = 0x2a9f8a7be393b600 + ( xl / 3 ) ;
659+ yl = 0x2a9f8a7be393b600 + ( yl / 3 ) ;
660+ zl = 0x2a9f8a7be393b600 + ( zl / 3 ) ;
661+
662+ Vector4 y4;
663+ y4 . X = ( float ) * ( double * ) & xl;
664+ y4 . Y = ( float ) * ( double * ) & yl;
665+ y4 . Z = ( float ) * ( double * ) & zl;
666+ y4 . W = 0 ;
667+
668+ y4 = ( 2 / 3f * y4 ) + ( 1 / 3f * ( v / ( y4 * y4 ) ) ) ;
669+ y4 = ( 2 / 3f * y4) + ( 1 / 3f * ( v / ( y4 * y4 ) ) ) ;
670+ y4 . W = a ;
671+
672+ v = y4 ;
673+ }
674+ #endif
675+ }
550676 }
551677}
0 commit comments