@@ -15,6 +15,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
1515{
1616 internal static class LossyUtils
1717 {
18+ #if SUPPORTS_RUNTIME_INTRINSICS
// Mask with 0x00ff in every 16-bit lane, reinterpreted as bytes: AND-ing with it keeps the low byte
// of each lane and clears the high byte. Mean16x4 uses it to isolate the even-indexed pixels of a row.
19+ private static readonly Vector128 < byte > Mean16x4Mask = Vector128 . Create ( ( short ) 0x00ff ) . AsByte ( ) ;
20+ #endif
21+
/// <summary>
/// Returns the result of <c>GetSse</c> for the two buffers using 16 for both size arguments.
/// NOTE(review): presumably the sum of squared errors over a 16x16 block - confirm against GetSse.
/// </summary>
/// <param name="a">The first buffer.</param>
/// <param name="b">The second buffer.</param>
/// <returns>The value computed by <c>GetSse</c>.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8Sse16X16(Span<byte> a, Span<byte> b)
{
    const int blockSize = 16;
    return GetSse(a, b, blockSize, blockSize);
}
2024
@@ -975,26 +979,55 @@ public static void HFilter8i(Span<byte> u, Span<byte> v, int offset, int stride,
975979 FilterLoop24 ( v , offsetPlus4 , 1 , stride , 8 , thresh , ithresh , hevThresh ) ;
976980 }
977981
978- [ MethodImpl ( InliningOptions . ShortMethod ) ]
979- public static uint LoadUv ( byte u , byte v ) =>
980- ( uint ) ( u | ( v << 16 ) ) ; // We process u and v together stashed into 32bit(16bit each).
981-
982- [ MethodImpl ( InliningOptions . ShortMethod ) ]
983- public static void YuvToBgr ( int y , int u , int v , Span < byte > bgr )
/// <summary>
/// Sums the pixels of four horizontally adjacent 4x4 blocks that together form a 16x4 region.
/// Despite the name, the values written are plain sums; no division is performed.
/// </summary>
/// <param name="input">
/// The source pixels. Consecutive rows are <c>WebpConstants.Bps</c> bytes apart, and
/// <c>3 * WebpConstants.Bps + 16</c> bytes must be readable.
/// </param>
/// <param name="dc">Receives the four block sums in elements 0..3; must hold at least 4 elements.</param>
public static void Mean16x4(Span<byte> input, Span<uint> dc)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    if (Ssse3.IsSupported)
    {
        // Every row is taken through Slice(.., 16) so the span bounds check runs before the
        // unchecked 16-byte vector load performed via Unsafe.As.
        Vector128<byte> a0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(input.Slice(0, 16)));
        Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps, 16)));
        Vector128<byte> a2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps * 2, 16)));
        Vector128<byte> a3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps * 3, 16)));

        // Split each 16-bit lane into its two pixels.
        Vector128<short> b0 = Sse2.ShiftRightLogical(a0.AsInt16(), 8); // hi byte
        Vector128<short> b1 = Sse2.ShiftRightLogical(a1.AsInt16(), 8);
        Vector128<short> b2 = Sse2.ShiftRightLogical(a2.AsInt16(), 8);
        Vector128<short> b3 = Sse2.ShiftRightLogical(a3.AsInt16(), 8);
        Vector128<byte> c0 = Sse2.And(a0, Mean16x4Mask); // lo byte
        Vector128<byte> c1 = Sse2.And(a1, Mean16x4Mask);
        Vector128<byte> c2 = Sse2.And(a2, Mean16x4Mask);
        Vector128<byte> c3 = Sse2.And(a3, Mean16x4Mask);

        // Pairwise pixel sums per row. A 32-bit add is safe here: every 16-bit half is at most
        // 4 * (255 + 255) = 2040 after all rows are accumulated, so nothing carries between halves.
        Vector128<int> d0 = Sse2.Add(b0.AsInt32(), c0.AsInt32());
        Vector128<int> d1 = Sse2.Add(b1.AsInt32(), c1.AsInt32());
        Vector128<int> d2 = Sse2.Add(b2.AsInt32(), c2.AsInt32());
        Vector128<int> d3 = Sse2.Add(b3.AsInt32(), c3.AsInt32());

        // Accumulate the four rows.
        Vector128<int> e0 = Sse2.Add(d0, d1);
        Vector128<int> e1 = Sse2.Add(d2, d3);
        Vector128<int> f0 = Sse2.Add(e0, e1);

        // Adding neighbouring 16-bit lanes folds each group of four columns into one block sum;
        // interleaving with zero widens the four low results to 32 bits.
        Vector128<short> hadd = Ssse3.HorizontalAdd(f0.AsInt16(), f0.AsInt16());
        Vector128<uint> wide = Sse2.UnpackLow(hadd, Vector128<short>.Zero).AsUInt32();

        // The store below writes 4 uints without a bounds check, so validate the destination
        // first: a short span now throws instead of overwriting adjacent memory.
        ref uint outputRef = ref MemoryMarshal.GetReference(dc.Slice(0, 4));
        Unsafe.As<uint, Vector128<uint>>(ref outputRef) = wide;
    }
    else
#endif
    {
        for (int k = 0; k < 4; k++)
        {
            uint avg = 0;
            for (int y = 0; y < 4; y++)
            {
                for (int x = 0; x < 4; x++)
                {
                    avg += input[x + (y * WebpConstants.Bps)];
                }
            }

            dc[k] = avg;
            input = input.Slice(4); // go to next 4x4 block.
        }
    }
}
9981031
/// <summary>
/// Computes the average of two bytes, rounding halves up.
/// </summary>
/// <param name="a">The first value.</param>
/// <param name="b">The second value.</param>
/// <returns><c>(a + b + 1) / 2</c>, which always fits in a byte.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static byte Avg2(byte a, byte b)
{
    // Operands are promoted to int, so the sum cannot overflow; the +1 rounds to nearest.
    int roundedSum = a + b + 1;
    return (byte)(roundedSum >> 1);
}
@@ -1200,9 +1233,6 @@ private static bool Hev(Span<byte> p, int offset, int step, int thresh)
12001233 return WebpLookupTables . Abs0 ( p1 - p0 ) > thresh || WebpLookupTables . Abs0 ( q1 - q0 ) > thresh ;
12011234 }
12021235
1203- [ MethodImpl ( InliningOptions . ShortMethod ) ]
1204- private static int MultHi ( int v , int coeff ) => ( v * coeff ) >> 8 ;
1205-
12061236 [ MethodImpl ( InliningOptions . ShortMethod ) ]
12071237 private static void Store ( Span < byte > dst , int x , int y , int v )
12081238 {
@@ -1225,13 +1255,6 @@ private static void Store2(Span<byte> dst, int y, int dc, int d, int c)
/// <summary>
/// Multiplies the value by the fixed-point factor 35468 / 65536 (roughly 0.5412).
/// NOTE(review): presumably one of the inverse-transform constants - confirm against the callers.
/// </summary>
/// <param name="a">The value to scale.</param>
/// <returns>The product shifted right arithmetically by 16 bits, i.e. rounded towards negative infinity.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static int Mul2(int a)
{
    int scaled = a * 35468;
    return scaled >> 16;
}
12271257
1228- [ MethodImpl ( InliningOptions . ShortMethod ) ]
1229- private static byte Clip8 ( int v )
1230- {
1231- int yuvMask = ( 256 << 6 ) - 1 ;
1232- return ( byte ) ( ( v & ~ yuvMask ) == 0 ? v >> 6 : v < 0 ? 0 : 255 ) ;
1233- }
1234-
12351258 [ MethodImpl ( InliningOptions . ShortMethod ) ]
12361259 private static void Put8x8uv ( byte value , Span < byte > dst )
12371260 {
0 commit comments