@@ -485,6 +485,57 @@ public static unsafe void Quantize(
485485 /// <param name="source">The source block.</param>
public static unsafe void Scale16X16To8X8(ref Block8x8F destination, ReadOnlySpan<Block8x8F> source)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    // Pick the implementation once, up front: the AVX2 kernel when the hardware
    // reports support for it, the portable scalar loop otherwise.
    if (Avx2.IsSupported)
    {
        Scale16X16To8X8Vectorized(ref destination, source);
    }
    else
    {
        Scale16X16To8X8Scalar(ref destination, source);
    }
#else
    // Builds without runtime intrinsics only have the scalar implementation.
    Scale16X16To8X8Scalar(ref destination, source);
#endif
}
498+
/// <summary>
/// AVX2 implementation of the 16x16 to 8x8 downscale: every 2x2 group of source
/// values is replaced by <c>(sum + 2) * 0.25</c>.
/// </summary>
/// <remarks>
/// The four source blocks are consumed as two horizontal pairs:
/// <c>source[0]</c>/<c>source[1]</c> produce destination rows 0..3 and
/// <c>source[2]</c>/<c>source[3]</c> produce destination rows 4..7, with the first
/// block of each pair filling columns 0..3 and the second filling columns 4..7.
/// Must only be called when <see cref="Avx2.IsSupported"/> is true; the body is
/// compiled out entirely when SUPPORTS_RUNTIME_INTRINSICS is not defined.
/// </remarks>
/// <param name="destination">The destination block.</param>
/// <param name="source">The source blocks; the first four elements are read.</param>
private static void Scale16X16To8X8Vectorized(ref Block8x8F destination, ReadOnlySpan<Block8x8F> source)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    Debug.Assert(Avx2.IsSupported, "AVX2 is required to execute this method");

    // The reads below go through Unsafe and are not bounds checked.
    Debug.Assert(source.Length >= 4, "Four source blocks are required to execute this method");

    var f2 = Vector256.Create(2f);
    var f025 = Vector256.Create(0.25f);

    // NOTE(review): the mask is defined elsewhere; it is expected to hold the indices
    // [0, 1, 4, 5, 2, 3, 6, 7], i.e. it swaps the two middle 64-bit pairs - confirm.
    Vector256<int> switchInnerDoubleWords = Unsafe.As<byte, Vector256<int>>(ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskSwitchInnerDWords8x32));

    ref Block8x8F sourceRef = ref MemoryMarshal.GetReference(source);
    ref Vector256<float> destRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref destination);

    // Assumes a Block8x8F is laid out as 8 consecutive rows of 8 floats, so one
    // block reinterprets as 8 Vector256<float> rows.
    for (int i = 0; i < 2; i++)
    {
        // Re-anchor on every pass. Walking a single pair of references through all
        // eight output rows would drift from source[0]/source[1] into
        // source[1]/source[2] for the bottom half instead of source[2]/source[3].
        ref Vector256<float> in1 = ref Unsafe.As<Block8x8F, Vector256<float>>(ref Unsafe.Add(ref sourceRef, 2 * i));
        ref Vector256<float> in2 = ref Unsafe.As<Block8x8F, Vector256<float>>(ref Unsafe.Add(ref sourceRef, (2 * i) + 1));

        // Each pass over two adjacent source rows yields one destination row.
        for (int j = 0; j < 8; j += 2)
        {
            Vector256<float> a = Unsafe.Add(ref in1, j);
            Vector256<float> b = Unsafe.Add(ref in1, j + 1);
            Vector256<float> c = Unsafe.Add(ref in2, j);
            Vector256<float> d = Unsafe.Add(ref in2, j + 1);

            // Per 128-bit lane: even-indexed elements of (first, second) operand, then odd-indexed
            // ones. Adding the two lines up horizontal neighbours.
            Vector256<float> calc1 = Avx.Shuffle(a, c, 0b10_00_10_00);
            Vector256<float> calc2 = Avx.Shuffle(a, c, 0b11_01_11_01);
            Vector256<float> calc3 = Avx.Shuffle(b, d, 0b10_00_10_00);
            Vector256<float> calc4 = Avx.Shuffle(b, d, 0b11_01_11_01);

            // 2x2 sums, then (sum + 2) * 0.25.
            Vector256<float> sum = Avx.Add(Avx.Add(calc1, calc2), Avx.Add(calc3, calc4));
            Vector256<float> add = Avx.Add(sum, f2);
            Vector256<float> res = Avx.Multiply(add, f025);

            // The lane-wise shuffle leaves the results interleaved as
            // [in1 0-1, in2 0-1 | in1 2-3, in2 2-3]; the permute regroups them so the
            // four values from in1 come first, followed by the four from in2.
            destRef = Avx2.PermuteVar8x32(res, switchInnerDoubleWords);
            destRef = ref Unsafe.Add(ref destRef, 1);
        }
    }
#endif
}
536+
537+ private static unsafe void Scale16X16To8X8Scalar ( ref Block8x8F destination , ReadOnlySpan < Block8x8F > source )
538+ {
488539 for ( int i = 0 ; i < 4 ; i ++ )
489540 {
490541 int dstOff = ( ( i & 2 ) << 4 ) | ( ( i & 1 ) << 2 ) ;
0 commit comments