@@ -41,13 +41,11 @@ public static int GreatestCommonDivisor(int a, int b)
4141
4242 /// <summary>
4343 /// Determine the Least Common Multiple (LCM) of two numbers, computed as <c>a / GCD(a, b) * b</c> (the division is applied before the multiplication).
44+ /// See https://en.wikipedia.org/wiki/Least_common_multiple#Reduction_by_the_greatest_common_divisor.
4445 /// </summary>
4546 [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
4647 public static int LeastCommonMultiple ( int a , int b )
47- {
48- // https://en.wikipedia.org/wiki/Least_common_multiple#Reduction_by_the_greatest_common_divisor
49- return ( a / GreatestCommonDivisor ( a , b ) ) * b ;
50- }
48+ => a / GreatestCommonDivisor ( a , b ) * b ;
5149
5250 /// <summary>
5351 /// Calculates <paramref name="x"/> % 2
@@ -290,10 +288,14 @@ public static void Clamp(Span<byte> span, byte min, byte max)
290288
291289 if ( remainder . Length > 0 )
292290 {
293- for ( int i = 0 ; i < remainder . Length ; i ++ )
291+ ref byte remainderStart = ref MemoryMarshal . GetReference ( remainder ) ;
292+ ref byte remainderEnd = ref Unsafe . Add ( ref remainderStart , remainder . Length ) ;
293+
294+ while ( Unsafe . IsAddressLessThan ( ref remainderStart , ref remainderEnd ) )
294295 {
295- ref byte v = ref remainder [ i ] ;
296- v = Clamp ( v , min , max ) ;
296+ remainderStart = Clamp ( remainderStart , min , max ) ;
297+
298+ remainderStart = ref Unsafe . Add ( ref remainderStart , 1 ) ;
297299 }
298300 }
299301 }
@@ -311,10 +313,14 @@ public static void Clamp(Span<uint> span, uint min, uint max)
311313
312314 if ( remainder . Length > 0 )
313315 {
314- for ( int i = 0 ; i < remainder . Length ; i ++ )
316+ ref uint remainderStart = ref MemoryMarshal . GetReference ( remainder ) ;
317+ ref uint remainderEnd = ref Unsafe . Add ( ref remainderStart , remainder . Length ) ;
318+
319+ while ( Unsafe . IsAddressLessThan ( ref remainderStart , ref remainderEnd ) )
315320 {
316- ref uint v = ref remainder [ i ] ;
317- v = Clamp ( v , min , max ) ;
321+ remainderStart = Clamp ( remainderStart , min , max ) ;
322+
323+ remainderStart = ref Unsafe . Add ( ref remainderStart , 1 ) ;
318324 }
319325 }
320326 }
@@ -332,10 +338,14 @@ public static void Clamp(Span<int> span, int min, int max)
332338
333339 if ( remainder . Length > 0 )
334340 {
335- for ( int i = 0 ; i < remainder . Length ; i ++ )
341+ ref int remainderStart = ref MemoryMarshal . GetReference ( remainder ) ;
342+ ref int remainderEnd = ref Unsafe . Add ( ref remainderStart , remainder . Length ) ;
343+
344+ while ( Unsafe . IsAddressLessThan ( ref remainderStart , ref remainderEnd ) )
336345 {
337- ref int v = ref remainder [ i ] ;
338- v = Clamp ( v , min , max ) ;
346+ remainderStart = Clamp ( remainderStart , min , max ) ;
347+
348+ remainderStart = ref Unsafe . Add ( ref remainderStart , 1 ) ;
339349 }
340350 }
341351 }
@@ -353,10 +363,14 @@ public static void Clamp(Span<float> span, float min, float max)
353363
354364 if ( remainder . Length > 0 )
355365 {
356- for ( int i = 0 ; i < remainder . Length ; i ++ )
366+ ref float remainderStart = ref MemoryMarshal . GetReference ( remainder ) ;
367+ ref float remainderEnd = ref Unsafe . Add ( ref remainderStart , remainder . Length ) ;
368+
369+ while ( Unsafe . IsAddressLessThan ( ref remainderStart , ref remainderEnd ) )
357370 {
358- ref float v = ref remainder [ i ] ;
359- v = Clamp ( v , min , max ) ;
371+ remainderStart = Clamp ( remainderStart , min , max ) ;
372+
373+ remainderStart = ref Unsafe . Add ( ref remainderStart , 1 ) ;
360374 }
361375 }
362376 }
@@ -374,10 +388,14 @@ public static void Clamp(Span<double> span, double min, double max)
374388
375389 if ( remainder . Length > 0 )
376390 {
377- for ( int i = 0 ; i < remainder . Length ; i ++ )
391+ ref double remainderStart = ref MemoryMarshal . GetReference ( remainder ) ;
392+ ref double remainderEnd = ref Unsafe . Add ( ref remainderStart , remainder . Length ) ;
393+
394+ while ( Unsafe . IsAddressLessThan ( ref remainderStart , ref remainderEnd ) )
378395 {
379- ref double v = ref remainder [ i ] ;
380- v = Clamp ( v , min , max ) ;
396+ remainderStart = Clamp ( remainderStart , min , max ) ;
397+
398+ remainderStart = ref Unsafe . Add ( ref remainderStart , 1 ) ;
381399 }
382400 }
383401 }
@@ -407,33 +425,42 @@ private static void ClampImpl<T>(Span<T> span, T min, T max)
407425 where T : unmanaged
408426 {
409427 ref T sRef = ref MemoryMarshal . GetReference ( span ) ;
410- ref Vector < T > vsBase = ref Unsafe . As < T , Vector < T > > ( ref MemoryMarshal . GetReference ( span ) ) ;
411428 var vmin = new Vector < T > ( min ) ;
412429 var vmax = new Vector < T > ( max ) ;
413430
414431 int n = span . Length / Vector < T > . Count ;
415432 int m = Modulo4 ( n ) ;
416433 int u = n - m ;
417434
418- for ( int i = 0 ; i < u ; i += 4 )
419- {
420- ref Vector < T > vs0 = ref Unsafe . Add ( ref vsBase , i ) ;
421- ref Vector < T > vs1 = ref Unsafe . Add ( ref vs0 , 1 ) ;
422- ref Vector < T > vs2 = ref Unsafe . Add ( ref vs0 , 2 ) ;
423- ref Vector < T > vs3 = ref Unsafe . Add ( ref vs0 , 3 ) ;
435+ ref Vector < T > vs0 = ref Unsafe . As < T , Vector < T > > ( ref MemoryMarshal . GetReference ( span ) ) ;
436+ ref Vector < T > vs1 = ref Unsafe . Add ( ref vs0 , 1 ) ;
437+ ref Vector < T > vs2 = ref Unsafe . Add ( ref vs0 , 2 ) ;
438+ ref Vector < T > vs3 = ref Unsafe . Add ( ref vs0 , 3 ) ;
439+ ref Vector < T > vsEnd = ref Unsafe . Add ( ref vs0 , u ) ;
424440
441+ while ( Unsafe . IsAddressLessThan ( ref vs0 , ref vsEnd ) )
442+ {
425443 vs0 = Vector . Min ( Vector . Max ( vmin , vs0 ) , vmax ) ;
426444 vs1 = Vector . Min ( Vector . Max ( vmin , vs1 ) , vmax ) ;
427445 vs2 = Vector . Min ( Vector . Max ( vmin , vs2 ) , vmax ) ;
428446 vs3 = Vector . Min ( Vector . Max ( vmin , vs3 ) , vmax ) ;
447+
448+ vs0 = ref Unsafe . Add ( ref vs0 , 4 ) ;
449+ vs1 = ref Unsafe . Add ( ref vs1 , 4 ) ;
450+ vs2 = ref Unsafe . Add ( ref vs2 , 4 ) ;
451+ vs3 = ref Unsafe . Add ( ref vs3 , 4 ) ;
429452 }
430453
431454 if ( m > 0 )
432455 {
433- for ( int i = u ; i < n ; i ++ )
456+ vs0 = ref vsEnd ;
457+ vsEnd = ref Unsafe . Add ( ref vsEnd , m ) ;
458+
459+ while ( Unsafe . IsAddressLessThan ( ref vs0 , ref vsEnd ) )
434460 {
435- ref Vector < T > vs0 = ref Unsafe . Add ( ref vsBase , i ) ;
436461 vs0 = Vector . Min ( Vector . Max ( vmin , vs0 ) , vmax ) ;
462+
463+ vs0 = ref Unsafe . Add ( ref vs0 , 1 ) ;
437464 }
438465 }
439466 }
@@ -472,10 +499,8 @@ public static void Premultiply(Span<Vector4> vectors)
472499#if SUPPORTS_RUNTIME_INTRINSICS
473500 if ( Avx2 . IsSupported && vectors . Length >= 2 )
474501 {
475- ref Vector256 < float > vectorsBase =
476- ref Unsafe . As < Vector4 , Vector256 < float > > ( ref MemoryMarshal . GetReference ( vectors ) ) ;
477-
478502 // Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
503+ ref Vector256 < float > vectorsBase = ref Unsafe . As < Vector4 , Vector256 < float > > ( ref MemoryMarshal . GetReference ( vectors ) ) ;
479504 ref Vector256 < float > vectorsLast = ref Unsafe . Add ( ref vectorsBase , ( IntPtr ) ( ( uint ) vectors . Length / 2u ) ) ;
480505
481506 while ( Unsafe . IsAddressLessThan ( ref vectorsBase , ref vectorsLast ) )
@@ -495,12 +520,14 @@ public static void Premultiply(Span<Vector4> vectors)
495520 else
496521#endif
497522 {
498- ref Vector4 baseRef = ref MemoryMarshal . GetReference ( vectors ) ;
523+ ref Vector4 vectorsStart = ref MemoryMarshal . GetReference ( vectors ) ;
524+ ref Vector4 vectorsEnd = ref Unsafe . Add ( ref vectorsStart , vectors . Length ) ;
499525
500- for ( int i = 0 ; i < vectors . Length ; i ++ )
526+ while ( Unsafe . IsAddressLessThan ( ref vectorsStart , ref vectorsEnd ) )
501527 {
502- ref Vector4 v = ref Unsafe . Add ( ref baseRef , i ) ;
503- Premultiply ( ref v ) ;
528+ Premultiply ( ref vectorsStart ) ;
529+
530+ vectorsStart = ref Unsafe . Add ( ref vectorsStart , 1 ) ;
504531 }
505532 }
506533 }
@@ -515,10 +542,8 @@ public static void UnPremultiply(Span<Vector4> vectors)
515542#if SUPPORTS_RUNTIME_INTRINSICS
516543 if ( Avx2 . IsSupported && vectors . Length >= 2 )
517544 {
518- ref Vector256 < float > vectorsBase =
519- ref Unsafe . As < Vector4 , Vector256 < float > > ( ref MemoryMarshal . GetReference ( vectors ) ) ;
520-
521545 // Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
546+ ref Vector256 < float > vectorsBase = ref Unsafe . As < Vector4 , Vector256 < float > > ( ref MemoryMarshal . GetReference ( vectors ) ) ;
522547 ref Vector256 < float > vectorsLast = ref Unsafe . Add ( ref vectorsBase , ( IntPtr ) ( ( uint ) vectors . Length / 2u ) ) ;
523548
524549 while ( Unsafe . IsAddressLessThan ( ref vectorsBase , ref vectorsLast ) )
@@ -538,12 +563,14 @@ public static void UnPremultiply(Span<Vector4> vectors)
538563 else
539564#endif
540565 {
541- ref Vector4 baseRef = ref MemoryMarshal . GetReference ( vectors ) ;
566+ ref Vector4 vectorsStart = ref MemoryMarshal . GetReference ( vectors ) ;
567+ ref Vector4 vectorsEnd = ref Unsafe . Add ( ref vectorsStart , vectors . Length ) ;
542568
543- for ( int i = 0 ; i < vectors . Length ; i ++ )
569+ while ( Unsafe . IsAddressLessThan ( ref vectorsStart , ref vectorsEnd ) )
544570 {
545- ref Vector4 v = ref Unsafe . Add ( ref baseRef , i ) ;
546- UnPremultiply ( ref v ) ;
571+ UnPremultiply ( ref vectorsStart ) ;
572+
573+ vectorsStart = ref Unsafe . Add ( ref vectorsStart , 1 ) ;
547574 }
548575 }
549576 }
@@ -633,53 +660,54 @@ public static unsafe void CubeRootOnXYZ(Span<Vector4> vectors)
633660 vectors128Ref = y4 ;
634661 vectors128Ref = ref Unsafe . Add ( ref vectors128Ref , 1 ) ;
635662 }
636-
637- return ;
638663 }
664+ else
639665#endif
640- ref Vector4 vectorsRef = ref MemoryMarshal . GetReference ( vectors ) ;
641- ref Vector4 vectorsEnd = ref Unsafe . Add ( ref vectorsRef , vectors . Length ) ;
642-
643- // Fallback with scalar preprocessing and vectorized approximation steps
644- while ( Unsafe . IsAddressLessThan ( ref vectorsRef , ref vectorsEnd ) )
645666 {
646- Vector4 v = vectorsRef ;
667+ ref Vector4 vectorsRef = ref MemoryMarshal . GetReference ( vectors ) ;
668+ ref Vector4 vectorsEnd = ref Unsafe . Add ( ref vectorsRef , vectors . Length ) ;
647669
648- double
649- x64 = v . X ,
650- y64 = v . Y ,
651- z64 = v . Z ;
652- float a = v . W ;
653-
654- ulong
655- xl = * ( ulong * ) & x64 ,
656- yl = * ( ulong * ) & y64 ,
657- zl = * ( ulong * ) & z64 ;
658-
659- // Here we use a trick to compute the starting value x0 for the cube root. This is because doing
660- // pow(x, 1 / gamma) is the same as the gamma-th root of x, and since gamme is 3 in this case,
661- // this means what we actually want is to find the cube root of our clamped values.
662- // For more info on the constant below, see:
663- // https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543.
664- // Here we perform the same trick on all RGB channels separately to help the CPU execute them in paralle, and
665- // store the alpha channel to preserve it. Then we set these values to the fields of a temporary 128-bit
666- // register, and use it to accelerate two steps of the Newton approximation using SIMD.
667- xl = 0x2a9f8a7be393b600 + ( xl / 3 ) ;
668- yl = 0x2a9f8a7be393b600 + ( yl / 3 ) ;
669- zl = 0x2a9f8a7be393b600 + ( zl / 3 ) ;
670-
671- Vector4 y4 ;
672- y4. X = ( float ) * ( double * ) & xl ;
673- y4. Y = ( float ) * ( double * ) & yl ;
674- y4. Z = ( float ) * ( double * ) & zl ;
675- y4. W = 0 ;
676-
677- y4 = ( 2 / 3f * y4 ) + ( 1 / 3f * ( v / ( y4 * y4 ) ) ) ;
678- y4 = ( 2 / 3f * y4 ) + ( 1 / 3f * ( v / ( y4 * y4 ) ) ) ;
679- y4 . W = a ;
680-
681- vectorsRef = y4 ;
682- vectorsRef = ref Unsafe . Add ( ref vectorsRef , 1 ) ;
670+ // Fallback with scalar preprocessing and vectorized approximation steps
671+ while ( Unsafe . IsAddressLessThan ( ref vectorsRef , ref vectorsEnd ) )
672+ {
673+ Vector4 v = vectorsRef ;
674+
675+ double
676+ x64 = v . X ,
677+ y64 = v . Y ,
678+ z64 = v . Z ;
679+ float a = v . W ;
680+
681+ ulong
682+ xl = * ( ulong * ) & x64 ,
683+ yl = * ( ulong * ) & y64 ,
684+ zl = * ( ulong * ) & z64 ;
685+
686+ // Here we use a trick to compute the starting value x0 for the cube root. This is because doing
687+ // pow(x, 1 / gamma) is the same as the gamma-th root of x, and since gamma is 3 in this case,
688+ // this means what we actually want is to find the cube root of our clamped values.
689+ // For more info on the constant below, see:
690+ // https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543.
691+ // Here we perform the same trick on all RGB channels separately to help the CPU execute them in parallel, and
692+ // store the alpha channel to preserve it. Then we set these values to the fields of a temporary 128-bit
693+ // register, and use it to accelerate two steps of the Newton approximation using SIMD.
694+ xl = 0x2a9f8a7be393b600 + ( xl / 3 ) ;
695+ yl = 0x2a9f8a7be393b600 + ( yl / 3 ) ;
696+ zl = 0x2a9f8a7be393b600 + ( zl / 3 ) ;
697+
698+ Vector4 y4 ;
699+ y4 . X = ( float ) * ( double * ) & xl ;
700+ y4. Y = ( float ) * ( double * ) & yl ;
701+ y4. Z = ( float ) * ( double * ) & zl ;
702+ y4. W = 0 ;
703+
704+ y4 = ( 2 / 3f * y4 ) + ( 1 / 3f * ( v / ( y4 * y4 ) ) ) ;
705+ y4 = ( 2 / 3f * y4 ) + ( 1 / 3f * ( v / ( y4 * y4 ) ) ) ;
706+ y4 . W = a ;
707+
708+ vectorsRef = y4 ;
709+ vectorsRef = ref Unsafe . Add ( ref vectorsRef , 1 ) ;
710+ }
683711 }
684712 }
685713 }
0 commit comments