@@ -39,6 +39,10 @@ internal static unsafe class PredictorEncoder
3939
// 128-bit mask holding 0x00FF in every 16-bit element; AND-ing with it keeps only the low byte of each 16-bit lane.
// NOTE(review): presumably consumed by the 128-bit (Sse41) path of CollectColorRedTransforms - confirm against the full file.
4040 private static readonly Vector128 < byte > CollectColorRedTransformsAndMask = Vector128 . Create ( ( short ) 0xff ) . AsByte ( ) ;
4141
// 256-bit mask holding 0x0000FF00 in every 32-bit element; AND-ing with it keeps only bits 8-15 of each element.
// The Avx2 path of CollectColorRedTransforms applies it to the packed input pixels to isolate the green value before packing to 16 bits.
42+ private static readonly Vector256 < byte > CollectColorRedTransformsGreenMask256 = Vector256 . Create ( 0x00ff00 ) . AsByte ( ) ;
43+
// 256-bit mask holding 0x00FF in every 16-bit element; AND-ing with it keeps only the low byte of each 16-bit lane.
// The Avx2 path of CollectColorRedTransforms applies it after the subtraction so that each 16-bit lane is a value in 0..255 usable as a histogram index.
44+ private static readonly Vector256 < byte > CollectColorRedTransformsAndMask256 = Vector256 . Create ( ( short ) 0xff ) . AsByte ( ) ;
45+
// 128-bit byte mask alternating 0, 255: AND-ing with it keeps every odd-indexed byte and clears every even-indexed byte.
// NOTE(review): presumably used by CollectColorBlueTransforms to isolate green - the usage is outside this view, confirm.
4246 private static readonly Vector128 < byte > CollectColorBlueTransformsGreenMask = Vector128 . Create ( 0 , 255 , 0 , 255 , 0 , 255 , 0 , 255 , 0 , 255 , 0 , 255 , 0 , 255 , 0 , 255 ) ;
4347
// 128-bit byte mask repeating 255, 255, 0, 0: AND-ing with it keeps the first two bytes of every 4-byte group and clears the last two.
// NOTE(review): presumably used by CollectColorBlueTransforms to keep the blue and green bytes of each pixel - the usage is outside this view, confirm.
4448 private static readonly Vector128 < byte > CollectColorBlueTransformsGreenBlueMask = Vector128 . Create ( 255 , 255 , 0 , 0 , 255 , 255 , 0 , 0 , 255 , 255 , 0 , 0 , 255 , 255 , 0 , 0 ) ;
@@ -1071,7 +1075,48 @@ private static double GetPredictionCostCrossColorBlue(
10711075 private static void CollectColorRedTransforms ( Span < uint > bgra , int stride , int tileWidth , int tileHeight , int greenToRed , Span < int > histo )
10721076 {
10731077#if SUPPORTS_RUNTIME_INTRINSICS
1074- if ( Sse41 . IsSupported )
1078+ if ( Avx2 . IsSupported && tileWidth > 16 )
1079+ {
1080+ var multsg = Vector256 . Create ( LosslessUtils . Cst5b ( greenToRed ) ) ;
1081+ const int span = 16 ;
1082+ Span < ushort > values = stackalloc ushort [ span ] ;
1083+ for ( int y = 0 ; y < tileHeight ; y ++ )
1084+ {
1085+ Span < uint > srcSpan = bgra . Slice ( y * stride ) ;
1086+ ref uint inputRef = ref MemoryMarshal . GetReference ( srcSpan ) ;
1087+ for ( int x = 0 ; x + span <= tileWidth ; x += span )
1088+ {
1089+ int input0Idx = x ;
1090+ int input1Idx = x + ( span / 2 ) ;
1091+ Vector256 < byte > input0 = Unsafe . As < uint , Vector256 < uint > > ( ref Unsafe . Add ( ref inputRef , input0Idx ) ) . AsByte ( ) ;
1092+ Vector256 < byte > input1 = Unsafe . As < uint , Vector256 < uint > > ( ref Unsafe . Add ( ref inputRef , input1Idx ) ) . AsByte ( ) ;
1093+ Vector256 < byte > g0 = Avx2 . And ( input0 , CollectColorRedTransformsGreenMask256 ) ; // 0 0 | g 0
1094+ Vector256 < byte > g1 = Avx2 . And ( input1 , CollectColorRedTransformsGreenMask256 ) ;
1095+ Vector256 < ushort > g = Avx2 . PackUnsignedSaturate ( g0 . AsInt32 ( ) , g1 . AsInt32 ( ) ) ; // g 0
1096+ Vector256 < int > a0 = Avx2 . ShiftRightLogical ( input0 . AsInt32 ( ) , 16 ) ; // 0 0 | x r
1097+ Vector256 < int > a1 = Avx2 . ShiftRightLogical ( input1 . AsInt32 ( ) , 16 ) ;
1098+ Vector256 < ushort > a = Avx2 . PackUnsignedSaturate ( a0 , a1 ) ; // x r
1099+ Vector256 < short > b = Avx2 . MultiplyHigh ( g . AsInt16 ( ) , multsg ) ; // x dr
1100+ Vector256 < byte > c = Avx2 . Subtract ( a . AsByte ( ) , b . AsByte ( ) ) ; // x r'
1101+ Vector256 < byte > d = Avx2 . And ( c , CollectColorRedTransformsAndMask256 ) ; // 0 r'
1102+
1103+ ref ushort outputRef = ref MemoryMarshal . GetReference ( values ) ;
1104+ Unsafe . As < ushort , Vector256 < ushort > > ( ref outputRef ) = d. AsUInt16 ( ) ;
1105+
1106+ for ( int i = 0 ; i < span ; i ++ )
1107+ {
1108+ ++ histo [ values [ i ] ] ;
1109+ }
1110+ }
1111+ }
1112+
1113+ int leftOver = tileWidth & ( span - 1 ) ;
1114+ if ( leftOver > 0 )
1115+ {
1116+ CollectColorRedTransformsNoneVectorized ( bgra . Slice ( tileWidth - leftOver ) , stride , leftOver , tileHeight , greenToRed , histo ) ;
1117+ }
1118+ }
1119+ else if ( Sse41 . IsSupported )
10751120 {
10761121 var multsg = Vector128 . Create ( LosslessUtils . Cst5b ( greenToRed ) ) ;
10771122 const int span = 8 ;
0 commit comments