@@ -41,13 +41,11 @@ public static int GreatestCommonDivisor(int a, int b)
4141
4242 /// <summary>
4343 /// Determine the Least Common Multiple (LCM) of two numbers.
44+ /// See https://en.wikipedia.org/wiki/Least_common_multiple#Reduction_by_the_greatest_common_divisor.
4445 /// </summary>
4546 [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
4647 public static int LeastCommonMultiple ( int a , int b )
47- {
48- // https://en.wikipedia.org/wiki/Least_common_multiple#Reduction_by_the_greatest_common_divisor
49- return ( a / GreatestCommonDivisor ( a , b ) ) * b ;
50- }
48+ => a / GreatestCommonDivisor ( a , b ) * b ;
5149
5250 /// <summary>
5351 /// Calculates <paramref name="x"/> % 2
@@ -290,10 +288,14 @@ public static void Clamp(Span<byte> span, byte min, byte max)
290288
291289 if ( remainder . Length > 0 )
292290 {
293- for ( int i = 0 ; i < remainder . Length ; i ++ )
291+ ref byte remainderStart = ref MemoryMarshal . GetReference ( remainder ) ;
292+ ref byte remainderEnd = ref Unsafe . Add ( ref remainderStart , remainder . Length ) ;
293+
294+ while ( Unsafe . IsAddressLessThan ( ref remainderStart , ref remainderEnd ) )
294295 {
295- ref byte v = ref remainder [ i ] ;
296- v = Clamp ( v , min , max ) ;
296+ remainderStart = Clamp ( remainderStart , min , max ) ;
297+
298+ remainderStart = ref Unsafe . Add ( ref remainderStart , 1 ) ;
297299 }
298300 }
299301 }
@@ -311,10 +313,14 @@ public static void Clamp(Span<uint> span, uint min, uint max)
311313
312314 if ( remainder . Length > 0 )
313315 {
314- for ( int i = 0 ; i < remainder . Length ; i ++ )
316+ ref uint remainderStart = ref MemoryMarshal . GetReference ( remainder ) ;
317+ ref uint remainderEnd = ref Unsafe . Add ( ref remainderStart , remainder . Length ) ;
318+
319+ while ( Unsafe . IsAddressLessThan ( ref remainderStart , ref remainderEnd ) )
315320 {
316- ref uint v = ref remainder [ i ] ;
317- v = Clamp ( v , min , max ) ;
321+ remainderStart = Clamp ( remainderStart , min , max ) ;
322+
323+ remainderStart = ref Unsafe . Add ( ref remainderStart , 1 ) ;
318324 }
319325 }
320326 }
@@ -332,10 +338,14 @@ public static void Clamp(Span<int> span, int min, int max)
332338
333339 if ( remainder . Length > 0 )
334340 {
335- for ( int i = 0 ; i < remainder . Length ; i ++ )
341+ ref int remainderStart = ref MemoryMarshal . GetReference ( remainder ) ;
342+ ref int remainderEnd = ref Unsafe . Add ( ref remainderStart , remainder . Length ) ;
343+
344+ while ( Unsafe . IsAddressLessThan ( ref remainderStart , ref remainderEnd ) )
336345 {
337- ref int v = ref remainder [ i ] ;
338- v = Clamp ( v , min , max ) ;
346+ remainderStart = Clamp ( remainderStart , min , max ) ;
347+
348+ remainderStart = ref Unsafe . Add ( ref remainderStart , 1 ) ;
339349 }
340350 }
341351 }
@@ -353,10 +363,14 @@ public static void Clamp(Span<float> span, float min, float max)
353363
354364 if ( remainder . Length > 0 )
355365 {
356- for ( int i = 0 ; i < remainder . Length ; i ++ )
366+ ref float remainderStart = ref MemoryMarshal . GetReference ( remainder ) ;
367+ ref float remainderEnd = ref Unsafe . Add ( ref remainderStart , remainder . Length ) ;
368+
369+ while ( Unsafe . IsAddressLessThan ( ref remainderStart , ref remainderEnd ) )
357370 {
358- ref float v = ref remainder [ i ] ;
359- v = Clamp ( v , min , max ) ;
371+ remainderStart = Clamp ( remainderStart , min , max ) ;
372+
373+ remainderStart = ref Unsafe . Add ( ref remainderStart , 1 ) ;
360374 }
361375 }
362376 }
@@ -374,10 +388,14 @@ public static void Clamp(Span<double> span, double min, double max)
374388
375389 if ( remainder . Length > 0 )
376390 {
377- for ( int i = 0 ; i < remainder . Length ; i ++ )
391+ ref double remainderStart = ref MemoryMarshal . GetReference ( remainder ) ;
392+ ref double remainderEnd = ref Unsafe . Add ( ref remainderStart , remainder . Length ) ;
393+
394+ while ( Unsafe . IsAddressLessThan ( ref remainderStart , ref remainderEnd ) )
378395 {
379- ref double v = ref remainder [ i ] ;
380- v = Clamp ( v , min , max ) ;
396+ remainderStart = Clamp ( remainderStart , min , max ) ;
397+
398+ remainderStart = ref Unsafe . Add ( ref remainderStart , 1 ) ;
381399 }
382400 }
383401 }
@@ -472,10 +490,8 @@ public static void Premultiply(Span<Vector4> vectors)
472490#if SUPPORTS_RUNTIME_INTRINSICS
473491 if ( Avx2 . IsSupported && vectors . Length >= 2 )
474492 {
475- ref Vector256 < float > vectorsBase =
476- ref Unsafe . As < Vector4 , Vector256 < float > > ( ref MemoryMarshal . GetReference ( vectors ) ) ;
477-
478493 // Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
494+ ref Vector256 < float > vectorsBase = ref Unsafe . As < Vector4 , Vector256 < float > > ( ref MemoryMarshal . GetReference ( vectors ) ) ;
479495 ref Vector256 < float > vectorsLast = ref Unsafe . Add ( ref vectorsBase , ( IntPtr ) ( ( uint ) vectors . Length / 2u ) ) ;
480496
481497 while ( Unsafe . IsAddressLessThan ( ref vectorsBase , ref vectorsLast ) )
@@ -495,12 +511,14 @@ public static void Premultiply(Span<Vector4> vectors)
495511 else
496512#endif
497513 {
498- ref Vector4 baseRef = ref MemoryMarshal . GetReference ( vectors ) ;
514+ ref Vector4 vectorsStart = ref MemoryMarshal . GetReference ( vectors ) ;
515+ ref Vector4 vectorsEnd = ref Unsafe . Add ( ref vectorsStart , vectors . Length ) ;
499516
500- for ( int i = 0 ; i < vectors . Length ; i ++ )
517+ while ( Unsafe . IsAddressLessThan ( ref vectorsStart , ref vectorsEnd ) )
501518 {
502- ref Vector4 v = ref Unsafe . Add ( ref baseRef , i ) ;
503- Premultiply ( ref v ) ;
519+ Premultiply ( ref vectorsStart ) ;
520+
521+ vectorsStart = ref Unsafe . Add ( ref vectorsStart , 1 ) ;
504522 }
505523 }
506524 }
@@ -515,10 +533,8 @@ public static void UnPremultiply(Span<Vector4> vectors)
515533#if SUPPORTS_RUNTIME_INTRINSICS
516534 if ( Avx2 . IsSupported && vectors . Length >= 2 )
517535 {
518- ref Vector256 < float > vectorsBase =
519- ref Unsafe . As < Vector4 , Vector256 < float > > ( ref MemoryMarshal . GetReference ( vectors ) ) ;
520-
521536 // Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
537+ ref Vector256 < float > vectorsBase = ref Unsafe . As < Vector4 , Vector256 < float > > ( ref MemoryMarshal . GetReference ( vectors ) ) ;
522538 ref Vector256 < float > vectorsLast = ref Unsafe . Add ( ref vectorsBase , ( IntPtr ) ( ( uint ) vectors . Length / 2u ) ) ;
523539
524540 while ( Unsafe . IsAddressLessThan ( ref vectorsBase , ref vectorsLast ) )
@@ -538,12 +554,14 @@ public static void UnPremultiply(Span<Vector4> vectors)
538554 else
539555#endif
540556 {
541- ref Vector4 baseRef = ref MemoryMarshal . GetReference ( vectors ) ;
557+ ref Vector4 vectorsStart = ref MemoryMarshal . GetReference ( vectors ) ;
558+ ref Vector4 vectorsEnd = ref Unsafe . Add ( ref vectorsStart , vectors . Length ) ;
542559
543- for ( int i = 0 ; i < vectors . Length ; i ++ )
560+ while ( Unsafe . IsAddressLessThan ( ref vectorsStart , ref vectorsEnd ) )
544561 {
545- ref Vector4 v = ref Unsafe . Add ( ref baseRef , i ) ;
546- UnPremultiply ( ref v ) ;
562+ UnPremultiply ( ref vectorsStart ) ;
563+
564+ vectorsStart = ref Unsafe . Add ( ref vectorsStart , 1 ) ;
547565 }
548566 }
549567 }
@@ -633,53 +651,54 @@ public static unsafe void CubeRootOnXYZ(Span<Vector4> vectors)
633651 vectors128Ref = y4 ;
634652 vectors128Ref = ref Unsafe . Add ( ref vectors128Ref , 1 ) ;
635653 }
636-
637- return ;
638654 }
655+ else
639656#endif
640- ref Vector4 vectorsRef = ref MemoryMarshal . GetReference ( vectors ) ;
641- ref Vector4 vectorsEnd = ref Unsafe . Add ( ref vectorsRef , vectors . Length ) ;
642-
643- // Fallback with scalar preprocessing and vectorized approximation steps
644- while ( Unsafe . IsAddressLessThan ( ref vectorsRef , ref vectorsEnd ) )
645657 {
646- Vector4 v = vectorsRef ;
647-
648- double
649- x64 = v . X ,
650- y64 = v . Y ,
651- z64 = v . Z ;
652- float a = v . W ;
658+ ref Vector4 vectorsRef = ref MemoryMarshal . GetReference ( vectors ) ;
659+ ref Vector4 vectorsEnd = ref Unsafe . Add ( ref vectorsRef , vectors . Length ) ;
653660
654- ulong
655- xl = * ( ulong * ) & x64 ,
656- yl = * ( ulong * ) & y64 ,
657- zl = * ( ulong * ) & z64 ;
658-
659- // Here we use a trick to compute the starting value x0 for the cube root. This is because doing
660- // pow(x, 1 / gamma) is the same as the gamma-th root of x, and since gamme is 3 in this case,
661- // this means what we actually want is to find the cube root of our clamped values.
662- // For more info on the constant below, see:
663- // https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543.
664- // Here we perform the same trick on all RGB channels separately to help the CPU execute them in paralle, and
665- // store the alpha channel to preserve it. Then we set these values to the fields of a temporary 128-bit
666- // register, and use it to accelerate two steps of the Newton approximation using SIMD.
667- xl = 0x2a9f8a7be393b600 + ( xl / 3 ) ;
668- yl = 0x2a9f8a7be393b600 + ( yl / 3 ) ;
669- zl = 0x2a9f8a7be393b600 + ( zl / 3 ) ;
670-
671- Vector4 y4 ;
672- y4. X = ( float ) * ( double * ) & xl ;
673- y4. Y = ( float ) * ( double * ) & yl ;
674- y4. Z = ( float ) * ( double * ) & zl ;
675- y4. W = 0 ;
676-
677- y4 = ( 2 / 3f * y4 ) + ( 1 / 3f * ( v / ( y4 * y4 ) ) ) ;
678- y4 = ( 2 / 3f * y4 ) + ( 1 / 3f * ( v / ( y4 * y4 ) ) ) ;
679- y4 . W = a ;
680-
681- vectorsRef = y4 ;
682- vectorsRef = ref Unsafe . Add ( ref vectorsRef , 1 ) ;
661+ // Fallback with scalar preprocessing and vectorized approximation steps
662+ while ( Unsafe . IsAddressLessThan ( ref vectorsRef , ref vectorsEnd ) )
663+ {
664+ Vector4 v = vectorsRef ;
665+
666+ double
667+ x64 = v . X ,
668+ y64 = v . Y ,
669+ z64 = v . Z ;
670+ float a = v . W ;
671+
672+ ulong
673+ xl = * ( ulong * ) & x64 ,
674+ yl = * ( ulong * ) & y64 ,
675+ zl = * ( ulong * ) & z64 ;
676+
677+ // Here we use a trick to compute the starting value x0 for the cube root. This is because doing
678+ // pow(x, 1 / gamma) is the same as the gamma-th root of x, and since gamma is 3 in this case,
679+ // this means what we actually want is to find the cube root of our clamped values.
680+ // For more info on the constant below, see:
681+ // https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543.
682+ // Here we perform the same trick on all RGB channels separately to help the CPU execute them in parallel, and
683+ // store the alpha channel to preserve it. Then we set these values to the fields of a temporary 128-bit
684+ // register, and use it to accelerate two steps of the Newton approximation using SIMD.
685+ xl = 0x2a9f8a7be393b600 + ( xl / 3 ) ;
686+ yl = 0x2a9f8a7be393b600 + ( yl / 3 ) ;
687+ zl = 0x2a9f8a7be393b600 + ( zl / 3 ) ;
688+
689+ Vector4 y4 ;
690+ y4 . X = ( float ) * ( double * ) & xl ;
691+ y4. Y = ( float ) * ( double * ) & yl ;
692+ y4. Z = ( float ) * ( double * ) & zl ;
693+ y4. W = 0 ;
694+
695+ y4 = ( 2 / 3f * y4 ) + ( 1 / 3f * ( v / ( y4 * y4 ) ) ) ;
696+ y4 = ( 2 / 3f * y4 ) + ( 1 / 3f * ( v / ( y4 * y4 ) ) ) ;
697+ y4 . W = a ;
698+
699+ vectorsRef = y4 ;
700+ vectorsRef = ref Unsafe . Add ( ref vectorsRef , 1 ) ;
701+ }
683702 }
684703 }
685704 }
0 commit comments