@@ -547,5 +547,140 @@ public static void UnPremultiply(Span<Vector4> vectors)
547547 }
548548 }
549549 }
550+
551+ /// <summary>
552+ /// Calculates the cube pow of all the XYZ channels of the input vectors.
553+ /// </summary>
554+ /// <param name="vectors">The span of vectors</param>
555+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
556+ public static unsafe void CubePowOnXYZ ( Span < Vector4 > vectors )
557+ {
558+ ref Vector4 baseRef = ref MemoryMarshal . GetReference ( vectors ) ;
559+ ref Vector4 endRef = ref Unsafe . Add ( ref baseRef , vectors . Length ) ;
560+
561+ while ( Unsafe . IsAddressLessThan ( ref baseRef , ref endRef ) )
562+ {
563+ Vector4 v = baseRef ;
564+ float a = v . W ;
565+
566+ // Fast path for the default gamma exposure, which is 3. In this case we can skip
567+ // calling Math.Pow 3 times (one per component), as the method is an internal call and
568+ // introduces quite a bit of overhead. Instead, we can just manually multiply the whole
569+ // pixel in Vector4 format 3 times, and then restore the alpha channel before copying it
570+ // back to the target index in the temporary span. The whole iteration will get completely
571+ // inlined and traslated into vectorized instructions, with much better performance.
572+ v = v * v * v ;
573+ v . W = a ;
574+
575+ baseRef = v ;
576+ baseRef = ref Unsafe . Add ( ref baseRef , 1 ) ;
577+ }
578+ }
579+
580+ /// <summary>
581+ /// Calculates the cube root of all the XYZ channels of the input vectors.
582+ /// </summary>
583+ /// <param name="vectors">The span of vectors</param>
584+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
585+ public static unsafe void CubeRootOnXYZ ( Span < Vector4 > vectors )
586+ {
587+ #if SUPPORTS_RUNTIME_INTRINSICS
588+ if ( Sse41 . IsSupported )
589+ {
590+ ref Vector128 < float > vectors128Ref = ref Unsafe . As < Vector4 , Vector128 < float > > ( ref MemoryMarshal . GetReference ( vectors ) ) ;
591+ ref Vector128 < float > vectors128End = ref Unsafe . Add ( ref vectors128Ref , vectors . Length ) ;
592+
593+ var v128_341 = Vector128 . Create ( 341 ) ;
594+ Vector128 < int > v128_negativeZero = Vector128 . Create ( - 0.0f ) . AsInt32 ( ) ;
595+ Vector128 < int > v128_one = Vector128 . Create ( 1.0f ) . AsInt32 ( ) ;
596+
597+ var v128_13rd = Vector128 . Create ( 1 / 3f ) ;
598+ var v128_23rds = Vector128 . Create ( 2 / 3f ) ;
599+
600+ while ( Unsafe . IsAddressLessThan ( ref vectors128Ref , ref vectors128End ) )
601+ {
602+ Vector128 < float > vecx = vectors128Ref ;
603+ Vector128 < int > veax = vecx . AsInt32 ( ) ;
604+
605+ // If we can use SSE41 instructions, we can vectorize the entire cube root calculation, and also execute it
606+ // directly on 32 bit floating point values. What follows is a vectorized implementation of this method:
607+ // https://www.musicdsp.org/en/latest/Other/206-fast-cube-root-square-root-and-reciprocal-for-x86-sse-cpus.html.
608+ // Furthermore, after the initial setup in vectorized form, we're doing two Newton approximations here
609+ // using a different succession (the same used below), which should be less unstable due to not having cube pow.
610+ veax = Sse2 . AndNot ( v128_negativeZero , veax ) ;
611+ veax = Sse2 . Subtract ( veax , v128_one ) ;
612+ veax = Sse2 . ShiftRightArithmetic ( veax , 10 ) ;
613+ veax = Sse41 . MultiplyLow ( veax , v128_341 ) ;
614+ veax = Sse2 . Add ( veax , v128_one ) ;
615+ veax = Sse2 . AndNot ( v128_negativeZero , veax ) ;
616+ veax = Sse2 . Or ( veax , Sse2 . And ( vecx . AsInt32 ( ) , v128_negativeZero ) ) ;
617+
618+ Vector128 < float > y4 = veax . AsSingle ( ) ;
619+
620+ if ( Fma . IsSupported )
621+ {
622+ y4 = Fma . MultiplyAdd ( v128_23rds , y4 , Sse . Multiply ( v128_13rd , Sse . Divide ( vecx , Sse . Multiply ( y4 , y4 ) ) ) ) ;
623+ y4 = Fma . MultiplyAdd ( v128_23rds , y4 , Sse . Multiply ( v128_13rd , Sse . Divide ( vecx , Sse . Multiply ( y4 , y4 ) ) ) ) ;
624+ }
625+ else
626+ {
627+ y4 = Sse . Add ( Sse . Multiply ( v128_23rds , y4 ) , Sse . Multiply ( v128_13rd , Sse . Divide ( vecx , Sse . Multiply ( y4 , y4 ) ) ) ) ;
628+ y4 = Sse . Add ( Sse . Multiply ( v128_23rds , y4 ) , Sse . Multiply ( v128_13rd , Sse . Divide ( vecx , Sse . Multiply ( y4 , y4 ) ) ) ) ;
629+ }
630+
631+ y4 = Sse41 . Insert ( y4 , vecx , 0xF0 ) ;
632+
633+ vectors128Ref = y4 ;
634+ vectors128Ref = ref Unsafe . Add ( ref vectors128Ref , 1 ) ;
635+ }
636+
637+ return ;
638+ }
639+ #endif
640+ ref Vector4 vectorsRef = ref MemoryMarshal . GetReference ( vectors ) ;
641+ ref Vector4 vectorsEnd = ref Unsafe . Add ( ref vectorsRef , vectors . Length ) ;
642+
643+ // Fallback with scalar preprocessing and vectorized approximation steps
644+ while ( Unsafe . IsAddressLessThan ( ref vectorsRef , ref vectorsEnd ) )
645+ {
646+ Vector4 v = vectorsRef ;
647+
648+ double
649+ x64 = v . X ,
650+ y64 = v . Y ,
651+ z64 = v . Z ;
652+ float a = v . W ;
653+
654+ ulong
655+ xl = * ( ulong * ) & x64 ,
656+ yl = * ( ulong * ) & y64 ,
657+ zl = * ( ulong * ) & z64 ;
658+
659+ // Here we use a trick to compute the starting value x0 for the cube root. This is because doing
660+ // pow(x, 1 / gamma) is the same as the gamma-th root of x, and since gamme is 3 in this case,
661+ // this means what we actually want is to find the cube root of our clamped values.
662+ // For more info on the constant below, see:
663+ // https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543.
664+ // Here we perform the same trick on all RGB channels separately to help the CPU execute them in paralle, and
665+ // store the alpha channel to preserve it. Then we set these values to the fields of a temporary 128-bit
666+ // register, and use it to accelerate two steps of the Newton approximation using SIMD.
667+ xl = 0x2a9f8a7be393b600 + ( xl / 3 ) ;
668+ yl = 0x2a9f8a7be393b600 + ( yl / 3 ) ;
669+ zl = 0x2a9f8a7be393b600 + ( zl / 3 ) ;
670+
671+ Vector4 y4 ;
672+ y4. X = ( float ) * ( double * ) & xl ;
673+ y4. Y = ( float ) * ( double * ) & yl ;
674+ y4. Z = ( float ) * ( double * ) & zl ;
675+ y4. W = 0 ;
676+
677+ y4 = ( 2 / 3f * y4 ) + ( 1 / 3f * ( v / ( y4 * y4 ) ) ) ;
678+ y4 = ( 2 / 3f * y4 ) + ( 1 / 3f * ( v / ( y4 * y4 ) ) ) ;
679+ y4 . W = a ;
680+
681+ vectorsRef = y4 ;
682+ vectorsRef = ref Unsafe . Add ( ref vectorsRef , 1 ) ;
683+ }
684+ }
550685 }
551686}
0 commit comments