@@ -611,87 +611,146 @@ public void TransposeInto(ref Block8x8F d)
611611#if SUPPORTS_RUNTIME_INTRINSICS
612612 if ( Avx . IsSupported )
613613 {
614- this . TransposeIntoAvx ( ref d ) ;
614+ // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
615+ Vector256 < float > r0 = Avx . InsertVector128 (
616+ Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V0L ) . ToVector256 ( ) ,
617+ Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V4L ) ,
618+ 1 ) ;
619+
620+ Vector256 < float > r1 = Avx . InsertVector128 (
621+ Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V1L ) . ToVector256 ( ) ,
622+ Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V5L ) ,
623+ 1 ) ;
624+
625+ Vector256 < float > r2 = Avx . InsertVector128 (
626+ Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V2L ) . ToVector256 ( ) ,
627+ Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V6L ) ,
628+ 1 ) ;
629+
630+ Vector256 < float > r3 = Avx . InsertVector128 (
631+ Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V3L ) . ToVector256 ( ) ,
632+ Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V7L ) ,
633+ 1 ) ;
634+
635+ Vector256 < float > r4 = Avx . InsertVector128 (
636+ Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V0R ) . ToVector256 ( ) ,
637+ Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V4R ) ,
638+ 1 ) ;
639+
640+ Vector256 < float > r5 = Avx . InsertVector128 (
641+ Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V1R ) . ToVector256 ( ) ,
642+ Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V5R ) ,
643+ 1 ) ;
644+
645+ Vector256 < float > r6 = Avx . InsertVector128 (
646+ Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V2R ) . ToVector256 ( ) ,
647+ Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V6R ) ,
648+ 1 ) ;
649+
650+ Vector256 < float > r7 = Avx . InsertVector128 (
651+ Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V3R ) . ToVector256 ( ) ,
652+ Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V7R ) ,
653+ 1 ) ;
654+
655+ Vector256 < float > t0 = Avx . UnpackLow ( r0 , r1 ) ;
656+ Vector256 < float > t2 = Avx . UnpackLow ( r2 , r3 ) ;
657+ Vector256 < float > v = Avx . Shuffle ( t0 , t2 , 0x4E ) ;
658+ Unsafe . As < Vector4 , Vector256 < float > > ( ref d . V0L ) = Avx. Blend ( t0 , v , 0xCC ) ;
659+ Unsafe . As < Vector4 , Vector256 < float > > ( ref d . V1L ) = Avx. Blend ( t2 , v , 0x33 ) ;
660+
661+ Vector256 < float > t4 = Avx . UnpackLow ( r4 , r5 ) ;
662+ Vector256 < float > t6 = Avx . UnpackLow ( r6 , r7 ) ;
663+ v = Avx . Shuffle ( t4 , t6 , 0x4E ) ;
664+ Unsafe . As < Vector4 , Vector256 < float > > ( ref d . V4L ) = Avx. Blend ( t4 , v , 0xCC ) ;
665+ Unsafe . As < Vector4 , Vector256 < float > > ( ref d . V5L ) = Avx. Blend ( t6 , v , 0x33 ) ;
666+
667+ Vector256 < float > t1 = Avx . UnpackHigh ( r0 , r1 ) ;
668+ Vector256 < float > t3 = Avx . UnpackHigh ( r2 , r3 ) ;
669+ v = Avx . Shuffle ( t1 , t3 , 0x4E ) ;
670+ Unsafe . As < Vector4 , Vector256 < float > > ( ref d . V2L ) = Avx. Blend ( t1 , v , 0xCC ) ;
671+ Unsafe . As < Vector4 , Vector256 < float > > ( ref d . V3L ) = Avx. Blend ( t3 , v , 0x33 ) ;
672+
673+ Vector256 < float > t5 = Avx . UnpackHigh ( r4 , r5 ) ;
674+ Vector256 < float > t7 = Avx . UnpackHigh ( r6 , r7 ) ;
675+ v = Avx . Shuffle ( t5 , t7 , 0x4E ) ;
676+ Unsafe . As < Vector4 , Vector256 < float > > ( ref d . V6L ) = Avx. Blend ( t5 , v , 0xCC ) ;
677+ Unsafe . As < Vector4 , Vector256 < float > > ( ref d . V7L ) = Avx. Blend ( t7 , v , 0x33 ) ;
615678 }
616679 else
617680#endif
618681 {
619- this . TransposeIntoFallback ( ref d ) ;
682+ d . V0L . X = this . V0L . X ;
683+ d . V1L . X = this . V0L . Y ;
684+ d . V2L . X = this . V0L . Z ;
685+ d . V3L . X = this . V0L . W ;
686+ d . V4L . X = this . V0R . X ;
687+ d . V5L . X = this . V0R . Y ;
688+ d . V6L . X = this . V0R . Z ;
689+ d . V7L . X = this . V0R . W ;
690+
691+ d . V0L . Y = this . V1L . X ;
692+ d . V1L . Y = this . V1L . Y ;
693+ d . V2L . Y = this . V1L . Z ;
694+ d . V3L . Y = this . V1L . W ;
695+ d . V4L . Y = this . V1R . X ;
696+ d . V5L . Y = this . V1R . Y ;
697+ d . V6L . Y = this . V1R . Z ;
698+ d . V7L . Y = this . V1R . W ;
699+
700+ d . V0L . Z = this . V2L . X ;
701+ d . V1L . Z = this . V2L . Y ;
702+ d . V2L . Z = this . V2L . Z ;
703+ d . V3L . Z = this . V2L . W ;
704+ d . V4L . Z = this . V2R . X ;
705+ d . V5L . Z = this . V2R . Y ;
706+ d . V6L . Z = this . V2R . Z ;
707+ d . V7L . Z = this . V2R . W ;
708+
709+ d . V0L . W = this . V3L . X ;
710+ d . V1L . W = this . V3L . Y ;
711+ d . V2L . W = this . V3L . Z ;
712+ d . V3L . W = this . V3L . W ;
713+ d . V4L . W = this . V3R . X ;
714+ d . V5L . W = this . V3R . Y ;
715+ d . V6L . W = this . V3R . Z ;
716+ d . V7L . W = this . V3R . W ;
717+
718+ d . V0R . X = this . V4L . X ;
719+ d . V1R . X = this . V4L . Y ;
720+ d . V2R . X = this . V4L . Z ;
721+ d . V3R . X = this . V4L . W ;
722+ d . V4R . X = this . V4R . X ;
723+ d . V5R . X = this . V4R . Y ;
724+ d . V6R . X = this . V4R . Z ;
725+ d . V7R . X = this . V4R . W ;
726+
727+ d . V0R . Y = this . V5L . X ;
728+ d . V1R . Y = this . V5L . Y ;
729+ d . V2R . Y = this . V5L . Z ;
730+ d . V3R . Y = this . V5L . W ;
731+ d . V4R . Y = this . V5R . X ;
732+ d . V5R . Y = this . V5R . Y ;
733+ d . V6R . Y = this . V5R . Z ;
734+ d . V7R . Y = this . V5R . W ;
735+
736+ d . V0R . Z = this . V6L . X ;
737+ d . V1R . Z = this . V6L . Y ;
738+ d . V2R . Z = this . V6L . Z ;
739+ d . V3R . Z = this . V6L . W ;
740+ d . V4R . Z = this . V6R . X ;
741+ d . V5R . Z = this . V6R . Y ;
742+ d . V6R . Z = this . V6R . Z ;
743+ d . V7R . Z = this . V6R . W ;
744+
745+ d . V0R . W = this . V7L . X ;
746+ d . V1R . W = this . V7L . Y ;
747+ d . V2R . W = this . V7L . Z ;
748+ d . V3R . W = this . V7L . W ;
749+ d . V4R . W = this . V7R . X ;
750+ d . V5R . W = this . V7R . Y ;
751+ d . V6R . W = this . V7R . Z ;
752+ d . V7R . W = this . V7R . W ;
620753 }
621754 }
622-
623- #if SUPPORTS_RUNTIME_INTRINSICS
624- /// <summary>
625- /// AVX-only variant for executing <see cref="TransposeInto(ref Block8x8F)"/>.
626- /// <see href="https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536"/>
627- /// </summary>
628- [ MethodImpl ( InliningOptions . ShortMethod ) ]
629- public void TransposeIntoAvx ( ref Block8x8F d )
630- {
631- Vector256 < float > r0 = Avx . InsertVector128 (
632- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V0L ) . ToVector256 ( ) ,
633- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V4L ) ,
634- 1 ) ;
635-
636- Vector256 < float > r1 = Avx . InsertVector128 (
637- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V1L ) . ToVector256 ( ) ,
638- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V5L ) ,
639- 1 ) ;
640-
641- Vector256 < float > r2 = Avx . InsertVector128 (
642- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V2L ) . ToVector256 ( ) ,
643- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V6L ) ,
644- 1 ) ;
645-
646- Vector256 < float > r3 = Avx . InsertVector128 (
647- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V3L ) . ToVector256 ( ) ,
648- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V7L ) ,
649- 1 ) ;
650-
651- Vector256 < float > r4 = Avx . InsertVector128 (
652- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V0R ) . ToVector256 ( ) ,
653- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V4R ) ,
654- 1 ) ;
655-
656- Vector256 < float > r5 = Avx . InsertVector128 (
657- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V1R ) . ToVector256 ( ) ,
658- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V5R ) ,
659- 1 ) ;
660-
661- Vector256 < float > r6 = Avx . InsertVector128 (
662- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V2R ) . ToVector256 ( ) ,
663- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V6R ) ,
664- 1 ) ;
665-
666- Vector256 < float > r7 = Avx . InsertVector128 (
667- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V3R ) . ToVector256 ( ) ,
668- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V7R ) ,
669- 1 ) ;
670-
671- Vector256 < float > t0 = Avx . UnpackLow ( r0 , r1 ) ;
672- Vector256 < float > t2 = Avx . UnpackLow ( r2 , r3 ) ;
673- Vector256 < float > v = Avx . Shuffle ( t0 , t2 , 0x4E ) ;
674- Unsafe . As < Vector4 , Vector256 < float > > ( ref d . V0L ) = Avx. Blend ( t0 , v , 0xCC ) ;
675- Unsafe . As < Vector4 , Vector256 < float > > ( ref d . V1L ) = Avx. Blend ( t2 , v , 0x33 ) ;
676-
677- Vector256 < float > t4 = Avx . UnpackLow ( r4 , r5 ) ;
678- Vector256 < float > t6 = Avx . UnpackLow ( r6 , r7 ) ;
679- v = Avx . Shuffle ( t4 , t6 , 0x4E ) ;
680- Unsafe . As < Vector4 , Vector256 < float > > ( ref d . V4L ) = Avx. Blend ( t4 , v , 0xCC ) ;
681- Unsafe . As < Vector4 , Vector256 < float > > ( ref d . V5L ) = Avx. Blend ( t6 , v , 0x33 ) ;
682-
683- Vector256 < float > t1 = Avx . UnpackHigh ( r0 , r1 ) ;
684- Vector256 < float > t3 = Avx . UnpackHigh ( r2 , r3 ) ;
685- v = Avx . Shuffle ( t1 , t3 , 0x4E ) ;
686- Unsafe . As < Vector4 , Vector256 < float > > ( ref d . V2L ) = Avx. Blend ( t1 , v , 0xCC ) ;
687- Unsafe . As < Vector4 , Vector256 < float > > ( ref d . V3L ) = Avx. Blend ( t3 , v , 0x33 ) ;
688-
689- Vector256 < float > t5 = Avx . UnpackHigh ( r4 , r5 ) ;
690- Vector256 < float > t7 = Avx . UnpackHigh ( r6 , r7 ) ;
691- v = Avx . Shuffle ( t5 , t7 , 0x4E ) ;
692- Unsafe . As < Vector4 , Vector256 < float > > ( ref d . V6L ) = Avx. Blend ( t5 , v , 0xCC ) ;
693- Unsafe . As < Vector4 , Vector256 < float > > ( ref d . V7L ) = Avx. Blend ( t7 , v , 0x33 ) ;
694- }
695- #endif
696755 }
697756}