@@ -704,28 +704,7 @@ public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ush
704704 // a20 a21 a22 a23 b20 b21 b22 b23
705705 // a30 a31 a32 a33 b30 b31 b32 b33
706706 // Transpose the two 4x4.
707- Vector128 < short > transpose00 = Sse2 . UnpackLow ( b0 , b1 ) ;
708- Vector128 < short > transpose01 = Sse2 . UnpackLow ( b2 , b3 ) ;
709- Vector128 < short > transpose02 = Sse2 . UnpackHigh ( b0 , b1 ) ;
710- Vector128 < short > transpose03 = Sse2 . UnpackHigh ( b2 , b3 ) ;
711-
712- // a00 a10 a01 a11 a02 a12 a03 a13
713- // a20 a30 a21 a31 a22 a32 a23 a33
714- // b00 b10 b01 b11 b02 b12 b03 b13
715- // b20 b30 b21 b31 b22 b32 b23 b33
716- Vector128 < int > transpose10 = Sse2 . UnpackLow ( transpose00 . AsInt32 ( ) , transpose01 . AsInt32 ( ) ) ;
717- Vector128 < int > transpose11 = Sse2 . UnpackLow ( transpose02 . AsInt32 ( ) , transpose03 . AsInt32 ( ) ) ;
718- Vector128 < int > transpose12 = Sse2 . UnpackHigh ( transpose00 . AsInt32 ( ) , transpose01 . AsInt32 ( ) ) ;
719- Vector128 < int > transpose13 = Sse2 . UnpackHigh ( transpose02 . AsInt32 ( ) , transpose03 . AsInt32 ( ) ) ;
720-
721- // a00 a10 a20 a30 a01 a11 a21 a31
722- // b00 b10 b20 b30 b01 b11 b21 b31
723- // a02 a12 a22 a32 a03 a13 a23 a33
724- // b02 b12 a22 b32 b03 b13 b23 b33
725- Vector128 < long > output0 = Sse2 . UnpackLow ( transpose10 . AsInt64 ( ) , transpose11 . AsInt64 ( ) ) ;
726- Vector128 < long > output1 = Sse2 . UnpackHigh ( transpose10 . AsInt64 ( ) , transpose11 . AsInt64 ( ) ) ;
727- Vector128 < long > output2 = Sse2 . UnpackLow ( transpose12 . AsInt64 ( ) , transpose13 . AsInt64 ( ) ) ;
728- Vector128 < long > output3 = Sse2 . UnpackHigh ( transpose12 . AsInt64 ( ) , transpose13 . AsInt64 ( ) ) ;
707+ Vp8Transpose_2_4x4_16b ( b0 , b1 , b2 , b3 , out Vector128 < long > output0 , out Vector128 < long > output1 , out Vector128 < long > output2 , out Vector128 < long > output3 ) ;
729708
730709 // a00 a10 a20 a30 b00 b10 b20 b30
731710 // a01 a11 a21 a31 b01 b11 b21 b31
@@ -769,6 +748,44 @@ public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ush
769748
770749 return Numerics . ReduceSum ( result ) ;
771750 }
751+
752+ // Transposes two 4x4 matrices of 16-bit values (a and b) held side by side in four registers: each input carries one row of a in its low four lanes and the same row of b in its high four lanes; each output carries one column of a followed by the same column of b, reinterpreted as Vector128<long>.
753+ [ MethodImpl ( InliningOptions . ShortMethod ) ]
754+ public static void Vp8Transpose_2_4x4_16b ( Vector128 < short > b0 , Vector128 < short > b1 , Vector128 < short > b2 , Vector128 < short > b3 , out Vector128 < long > output0 , out Vector128 < long > output1 , out Vector128 < long > output2 , out Vector128 < long > output3 )
755+ {
756+ // Input layout on entry (one register per line):
757+ // a00 a01 a02 a03 b00 b01 b02 b03   <- b0
758+ // a10 a11 a12 a13 b10 b11 b12 b13   <- b1
759+ // a20 a21 a22 a23 b20 b21 b22 b23   <- b2
760+ // a30 a31 a32 a33 b30 b31 b32 b33   <- b3
761+ Vector128 < short > transpose00 = Sse2 . UnpackLow ( b0 , b1 ) ;
762+ Vector128 < short > transpose01 = Sse2 . UnpackLow ( b2 , b3 ) ;
763+ Vector128 < short > transpose02 = Sse2 . UnpackHigh ( b0 , b1 ) ;
764+ Vector128 < short > transpose03 = Sse2 . UnpackHigh ( b2 , b3 ) ;
765+
766+ // a00 a10 a01 a11 a02 a12 a03 a13   <- transpose00: rows 0 and 1 of a interleaved lane by lane
767+ // a20 a30 a21 a31 a22 a32 a23 a33   <- transpose01: rows 2 and 3 of a interleaved lane by lane
768+ // b00 b10 b01 b11 b02 b12 b03 b13   <- transpose02: rows 0 and 1 of b interleaved lane by lane
769+ // b20 b30 b21 b31 b22 b32 b23 b33   <- transpose03: rows 2 and 3 of b interleaved lane by lane
770+ Vector128 < int > transpose10 = Sse2 . UnpackLow ( transpose00 . AsInt32 ( ) , transpose01 . AsInt32 ( ) ) ;
771+ Vector128 < int > transpose11 = Sse2 . UnpackLow ( transpose02 . AsInt32 ( ) , transpose03 . AsInt32 ( ) ) ;
772+ Vector128 < int > transpose12 = Sse2 . UnpackHigh ( transpose00 . AsInt32 ( ) , transpose01 . AsInt32 ( ) ) ;
773+ Vector128 < int > transpose13 = Sse2 . UnpackHigh ( transpose02 . AsInt32 ( ) , transpose03 . AsInt32 ( ) ) ;
774+
775+ // a00 a10 a20 a30 a01 a11 a21 a31   <- transpose10: columns 0 and 1 of a (pairs interleaved as 32-bit units)
776+ // b00 b10 b20 b30 b01 b11 b21 b31   <- transpose11: columns 0 and 1 of b
777+ // a02 a12 a22 a32 a03 a13 a23 a33   <- transpose12: columns 2 and 3 of a
778+ // b02 b12 b22 b32 b03 b13 b23 b33   <- transpose13: columns 2 and 3 of b
779+ output0 = Sse2 . UnpackLow ( transpose10 . AsInt64 ( ) , transpose11 . AsInt64 ( ) ) ;
780+ output1 = Sse2 . UnpackHigh ( transpose10 . AsInt64 ( ) , transpose11 . AsInt64 ( ) ) ;
781+ output2 = Sse2 . UnpackLow ( transpose12 . AsInt64 ( ) , transpose13 . AsInt64 ( ) ) ;
782+ output3 = Sse2 . UnpackHigh ( transpose12 . AsInt64 ( ) , transpose13 . AsInt64 ( ) ) ;
783+
784+ // a00 a10 a20 a30 b00 b10 b20 b30   <- output0: column 0 of a, then column 0 of b
785+ // a01 a11 a21 a31 b01 b11 b21 b31   <- output1: column 1 of a, then column 1 of b
786+ // a02 a12 a22 a32 b02 b12 b22 b32   <- output2: column 2 of a, then column 2 of b
787+ // a03 a13 a23 a33 b03 b13 b23 b33   <- output3: column 3 of a, then column 3 of b
788+ }
772789#endif
773790
774791 public static void TransformTwo ( Span < short > src , Span < byte > dst , Span < int > scratch )
0 commit comments